import os
import networkx as nx
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd
import shapely
from shapely.geometry import Point
import math
import time
from matplotlib_scalebar.scalebar import ScaleBar
from mpl_toolkits.axes_grid1 import make_axes_locatable
# Configure Notebook
import warnings
warnings.filterwarnings('ignore')
import matplotlib as mpl
plt.rcParams['figure.dpi'] = 450 # increase resolution
# Configure Notebook
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
First, import the raw data from the previously exported CSV file
# Load the previously exported raw bike-share trips and take a first look.
trips_data = pd.read_csv('trips_raw_data.csv')
trips_data.head()
trips_data.info()
Converting Start Time and End Time to Datetime instead of object
# Parse the Start/End Time strings into timezone-aware datetimes in EST.
# NOTE(review): tz_convert assumes the parsed timestamps already carry
# timezone info (e.g. UTC offsets in the CSV) — confirm against the raw file.
trips_data['Start Time'] = pd.DatetimeIndex(trips_data['Start Time']).tz_convert(tz = 'EST')
trips_data['End Time'] = pd.DatetimeIndex(trips_data['End Time']).tz_convert(tz = 'EST')
trips_data.info()
trips_data.head()
Since before 2018 user data is incomplete, we will exclude data before 2018 from Start Time
# User-type data is incomplete before 2018, so keep only trips from 2018 on.
trips_data = trips_data[trips_data['Start Time']>'2018-01-01']
trips_data.head()
In this section, casual and annual members are compared regardless of the weather conditions to provide a general overview and behaviour differences with a Frequency of Days
# Daily counts of total, annual-member and casual-member rides,
# keyed on the trip Start Time.
trips_data_day = trips_data.groupby(pd.Grouper(key='Start Time', freq='D')).agg(
    rides=pd.NamedAgg(column='User Type', aggfunc='count'),
    annual_members=pd.NamedAgg(column='User Type',
                               aggfunc=lambda s: s.eq('Annual Member').sum()),
    casual_members=pd.NamedAgg(column='User Type',
                               aggfunc=lambda s: s.eq('Casual Member').sum()),
)
# Nullable integer dtype matches the rest of the notebook's frames.
for col in ('rides', 'annual_members', 'casual_members'):
    trips_data_day[col] = trips_data_day[col].astype('Int64')
trips_data_day.head()
Adding workday indicator
# Flag weekdays (Mon-Fri, dayofweek < 5) as 'True'. Note the flag is stored
# as a string, which is what the later hue/groupby plots expect.
trips_data_day['workday'] = np.where((trips_data_day.index.dayofweek) < 5,'True','False')
trips_data_day = trips_data_day.dropna()
trips_data_day.head()
First, a KDE plot with a hue semantic to show the distribution
import matplotlib.patches as mpatches
plt.figure(figsize=(7, 5))
# The KDE needs plain floats; the columns were nullable Int64 up to here.
trips_data_day['casual_members']=trips_data_day['casual_members'].astype('float64')
trips_data_day['annual_members']=trips_data_day['annual_members'].astype('float64')
plt.title('Comparison of Casual Members and Annual Members \n on Working and Non-working days', fontsize=18)
# 2-D density of daily casual vs. annual rides, split by the workday flag.
ax=sns.kdeplot(data=trips_data_day, x="casual_members", y="annual_members", hue="workday")
ax.set_xlabel('Casual Membership',fontsize=12)
ax.set_ylabel('Annual Membership',fontsize=12)
plt.show()
Combining with scatter plot to show distribution
# Scatter plot of daily casual vs. annual rides, coloured by workday flag.
plt.figure(figsize=(7, 5))
plt.title('Comparison of Casual Members and Annual Members \n on Working and Non-working days', fontsize=18)
bx=sns.scatterplot(data=trips_data_day, x="casual_members", y="annual_members",hue="workday")
# Use this axes' own legend. (The original read handles/labels from the
# previous KDE plot's `ax` and never used them — dead, misleading code.)
bx.legend( title='workday')
bx.set_xlabel('Casual Membership',fontsize=12)
bx.set_ylabel('Annual Membership',fontsize=12)
plt.show()
Adding violin plot to further illustrate the distribution for both annual and casual members
# Violin plot: distribution of annual-member rides on work vs. non-work days.
plt.figure(figsize=(8, 5))
plt.title('Comparison of Annual Members \n on Working and Non-working days', fontsize=18)
cx1 = sns.violinplot(x="workday", y="annual_members", data=trips_data_day)
cx1.set_xlabel('Workday',fontsize=16)
cx1.set_ylabel('Annual Rides per Day',fontsize=16)
plt.show()
# Violin plot: distribution of casual-member rides on work vs. non-work days.
plt.figure(figsize=(8, 5))
plt.title('Comparison of Casual Members \n on Working and Non-working days', fontsize=18)
cx2 = sns.violinplot(x="workday", y="casual_members", data=trips_data_day)
cx2.set_xlabel('Workday',fontsize=16)
# Fixed label: this plot shows CASUAL rides (the original said 'Annual').
cx2.set_ylabel('Casual Rides per Day',fontsize=16)
plt.show()
From the first two graphs above, it is evident that there are more annual membership users utilizing the bike share service on weekdays, suggesting that these users are using the service for possible commuting to and from work. On the contrary, for casual members, the patterns are more spread out with a higher usage during the weekends, suggesting that casual users are utilizing the service for possible leisure purposes without specific pattern.
This is further demonstrated through the violin plot distribution graphs, where it is evident that on weekdays annual membership users have a more evenly distributed number of rides per day in comparison to casual members, suggesting that casual members are less likely to use this service on weekdays. However, the probability of casual members using the bike share service increased on weekends, compared to annual users for whom weekdays and weekends showed less variation in the probability of ridership.
However, in order to further justify the assumption that annual users are utilizing for commuting purposes, hours of use are examined below.
To further examine the purposes of each member type, the start time is grouped together at hourly interval
# Hourly counts of total, annual-member and casual-member rides,
# keyed on the trip Start Time.
trips_data_hour = trips_data.groupby(pd.Grouper(key='Start Time', freq='h')).agg(
    rides=pd.NamedAgg(column='User Type', aggfunc='count'),
    annual_members=pd.NamedAgg(column='User Type',
                               aggfunc=lambda s: s.eq('Annual Member').sum()),
    casual_members=pd.NamedAgg(column='User Type',
                               aggfunc=lambda s: s.eq('Casual Member').sum()),
)
# Nullable integer dtype matches the rest of the notebook's frames.
for col in ('rides', 'annual_members', 'casual_members'):
    trips_data_hour[col] = trips_data_hour[col].astype('Int64')
trips_data_hour.head()
Since for this comparison date does not matter, each hour will be grouped together and taking the average of the trips taken by each user type
# Average each hour-of-day across all dates. Extract the hour directly from
# the DatetimeIndex and drop the index before averaging so that no datetime
# column reaches mean() — the original kept 'Start Time' as a column via
# reset_index(), which newer pandas refuses to average (TypeError).
trips_data_hour['time'] = trips_data_hour.index.hour
trips_data_hour = trips_data_hour.reset_index(drop=True)
trips_data_hour = trips_data_hour.groupby(by='time').mean()
trips_data_hour.head()
Now plotting a distribution graph to show time of day and usage comparison
# Line plot of average rides per hour of day, one line per user type.
plt.figure(figsize=(7, 5))
plt.title('Average hourly rides per day', fontsize=18)
# NOTE(review): x='time' refers to the groupby index name of trips_data_hour.
dx1=sns.lineplot(data=trips_data_hour, x='time', y="annual_members",label='Annual Members')
dx2=sns.lineplot(data=trips_data_hour, x='time', y="casual_members",label='Casual Members')
dx1.set_xlabel('Hour of the Day',fontsize=12)
dx1.set_ylabel('Average Rides per Hour',fontsize=12)
plt.show()
This graph clearly indicates 2 peak times for annual members at Hour 8 and Hour 17, aligning with rush hour times and matching the expectation that most annual members are utilizing the bike share service for the purpose of commuting to and from work.
On the contrary, there is a lack of a significant peak hour for casual members aside from the gradual increase between Hour 12 and Hour 19, which suggests a more evenly spread out use such as for leisure or a specific purpose that is not time dependent.
To further examine ridership type behaviours, weather factors are included
First checking whether there are any null values in the Weather column
# Count missing values per column, then see how trips distribute over the
# recorded weather conditions.
trips_data.isnull().sum(axis=0).to_frame('count')
trips_data.groupby('Weather')['Trip Id'].count().sort_values(ascending=False)
From above since there are no clear indication, the assumption is that for the null values in weather those will be clear
The dataset will be grouped into hourly breakdown with ridership type, max temperature and weather condition
trips_data_w = trips_data
# Missing Weather values are treated as clear conditions. Plain assignment
# avoids the deprecated inplace-fillna-on-a-column (chained assignment).
trips_data_w['Weather'] = trips_data_w['Weather'].fillna('clearday')
trips_data_w = trips_data_w.groupby([pd.Grouper(key='Start Time', freq='D')]).agg(
    rides=pd.NamedAgg(column='User Type', aggfunc=lambda x: (x.count())),
    annual_members=pd.NamedAgg(column='User Type', aggfunc=lambda x: (x[x=='Annual Member'].count())),
    casual_members=pd.NamedAgg(column='User Type', aggfunc=lambda x: (x[x=='Casual Member'].count())),
    weather_count=pd.NamedAgg(column='Weather', aggfunc=lambda x: (x.count())),
    # Number of observations reporting clear weather. (Renamed from the
    # original `weather_rain_count`, which was misleading — it counts
    # 'clearday', not rain.)
    weather_clear_count=pd.NamedAgg(column='Weather', aggfunc=lambda x: (x[x=='clearday'].count()))
)
trips_data_w['rides'] = trips_data_w['rides'].astype('Int64')
trips_data_w['annual_members'] = trips_data_w['annual_members'].astype('Int64')
trips_data_w['casual_members'] = trips_data_w['casual_members'].astype('Int64')
# Share of the day's observations that were clear, in percent.
trips_data_w['weather_per'] = trips_data_w['weather_clear_count'].div(trips_data_w.weather_count) * 100
# A day is labelled 'Clear' when at least half of its observations were clear.
trips_data_w['weather'] = np.where(trips_data_w['weather_per'] >= 50.0, 'Clear', 'Precipitation')
trips_data_w = trips_data_w[['rides','annual_members','casual_members','weather']]
# View DataFrame
trips_data_w.head(10)
Dropping off any time where rides is 0 and NA
# Discard days with zero recorded rides, then drop remaining NA rows.
trips_data_w_d = trips_data_w.loc[trips_data_w['rides'].ne(0)]
trips_data_w_d = trips_data_w_d.dropna()
trips_data_w_d.head(10)
First a box plot is used to compare behaviour changes due to weather conditions for all riders
# Box plot: daily total rides under clear vs. precipitation conditions.
plt.figure(figsize=(8, 5))
plt.title('Effect of Weather on Total Ridership', fontsize=18)
# The box plot needs plain floats; rides was nullable Int64.
trips_data_w_d['rides']=trips_data_w_d['rides'].astype('float64')
ex = sns.boxplot(x="weather", y="rides", data=trips_data_w_d)
ex.set_ylim(0,16000)
ex.set_xlabel('Weather Conditions',fontsize=16)
ex.set_ylabel('Rides per Day',fontsize=16)
plt.show()
Adding a violin plot
# Violin plot: daily total rides under clear vs. precipitation conditions,
# showing the full density rather than quartiles only.
plt.figure(figsize=(8, 5))
plt.title('Effect of Weather on Total Ridership', fontsize=18)
trips_data_w_d['rides']=trips_data_w_d['rides'].astype('float64')
fx = sns.violinplot(x="weather", y="rides", data=trips_data_w_d)
fx.set_ylim(0,16000)
fx.set_xlabel('Weather Conditions',fontsize=16)
fx.set_ylabel('Rides per Day',fontsize=16)
plt.show()
From a total ridership perspective, it is evident that when the weather is clear, there is a more even distribution of riders per day ranging from 2000 to 10,000. Whereas when there is precipitation, the probability of having a higher ridership per day significantly decreased, and the probability of low ridership per day (in this case around 2000 riders) is the highest by a significant margin as well.
To further explore this, annual vs casual members are examined individually
Annual members are first looked at using both box plot and violin plot
# Box plot: annual-member rides under clear vs. precipitation conditions.
plt.figure(figsize=(8, 5))
plt.title('Effect of Weather on Annual Ridership', fontsize=18)
trips_data_w_d['annual_members']=trips_data_w_d['annual_members'].astype('float64')
gx = sns.boxplot(x="weather", y="annual_members", data=trips_data_w_d)
gx.set_ylim(0,16000)
gx.set_xlabel('Weather Conditions',fontsize=16)
gx.set_ylabel('Annual Member Rides per Day',fontsize=16)
plt.show()
# Violin plot: annual-member rides under clear vs. precipitation conditions.
plt.figure(figsize=(8, 5))
plt.title('Effect of Weather on Annual Ridership', fontsize=18)
hx = sns.violinplot(x="weather", y="annual_members", data=trips_data_w_d)
hx.set_ylim(0,16000)
hx.set_xlabel('Weather Conditions',fontsize=16)
hx.set_ylabel('Annual Member Rides per Day',fontsize=16)
plt.show()
Now repeat this exercise for casual members
# Box plot: casual-member rides under clear vs. precipitation conditions.
plt.figure(figsize=(8, 5))
plt.title('Effect of Weather on Casual Ridership', fontsize=18)
trips_data_w_d['casual_members']=trips_data_w_d['casual_members'].astype('float64')
ix = sns.boxplot(x="weather", y="casual_members", data=trips_data_w_d)
ix.set_xlabel('Weather Conditions',fontsize=16)
# Fixed label: this plot shows CASUAL members (the original said 'Annual').
ix.set_ylabel('Casual Member Rides per Day',fontsize=16)
plt.show()
# Violin plot: casual-member rides under clear vs. precipitation conditions.
# (Removed a redundant astype('float64') — casual_members is already cast
# to float by the box-plot cell just above.)
plt.figure(figsize=(8, 5))
plt.title('Effect of Weather on Casual Ridership', fontsize=18)
jx = sns.violinplot(x="weather", y="casual_members", data=trips_data_w_d)
jx.set_xlabel('Weather Conditions',fontsize=16)
jx.set_ylabel('Casual Member Rides per Day',fontsize=16)
plt.show()
Based on the boxplot and the violin plot above, it is evident that weather does have a significant impact on the probability of ridership per day for both casual and annual members.
Comparing the two types of riders, casual members are more impacted by weather conditions than annual members, especially the range of the probability of ridership, where with clear sky the probability ranges up to 13,000 riders, and for precipitation days the range decreased to 9,000 per day. However, the probability range for annual members did not vary much, suggesting that a larger group of annual-member riders are not impacted by precipitation.
In this section, in addition to precipitation, other weather factors are explored to determine its impact on ridership
To explore additional factors, a new dataframe is used to incorporate max temperature, min temperature, average relative humidity, and mean wind speed on a daily basis.
# Daily ridership joined with daily weather aggregates: max/min temperature,
# mean relative humidity and mean wind speed.
trips_data_wa = trips_data.groupby(pd.Grouper(key='Start Time', freq='D')).agg(
    rides=pd.NamedAgg(column='User Type', aggfunc='count'),
    annual_members=pd.NamedAgg(column='User Type',
                               aggfunc=lambda s: s.eq('Annual Member').sum()),
    casual_members=pd.NamedAgg(column='User Type',
                               aggfunc=lambda s: s.eq('Casual Member').sum()),
    max_temp=pd.NamedAgg(column='Temp (°C)', aggfunc='max'),
    min_temp=pd.NamedAgg(column='Temp (°C)', aggfunc='min'),
    rel_hum=pd.NamedAgg(column='Rel Hum (%)', aggfunc='mean'),
    wind_speed=pd.NamedAgg(column='Wind Spd (km/h)', aggfunc='mean'),
)
for col in ('rides', 'annual_members', 'casual_members'):
    trips_data_wa[col] = trips_data_wa[col].astype('Int64')
# Friendlier column names for the plots below.
trips_data_wa = trips_data_wa.rename(columns={
    'max_temp': 'max temperature',
    'min_temp': 'min temperature',
    'rel_hum': 'average relative humidity',
    'wind_speed': 'average wind speed',
})
trips_data_wa = trips_data_wa[['rides', 'annual_members', 'casual_members',
                               'max temperature', 'min temperature',
                               'average relative humidity', 'average wind speed']]
# View DataFrame
trips_data_wa.head(10)
As well as dropping any 0 ridership data and NA
# Drop zero-ridership days and remaining NA rows. The original built the
# mask from the weather dataframe (trips_data_w) by mistake; the mask must
# come from trips_data_wa itself.
trips_data_wa_d=trips_data_wa[(trips_data_wa[['rides']] != 0).all(axis=1)]
trips_data_wa_d = trips_data_wa_d.dropna()
trips_data_wa_d.head()
Now we can take a look at temperature influence on ridership through scatter plots. First, looking at max temperature
# Scatter: maximum daily temperature vs. rides per day, per user type.
plt.figure(figsize=(8, 5))
# Fixed title: this plot uses the MAXIMUM daily temperature (the original
# said 'Minimum'); also fixed the 'Temperatue' typo.
plt.title('Maximum Temperature Influence on Ridership', fontsize=18)
kxma1= sns.scatterplot(data=trips_data_wa_d, x="max temperature", y="annual_members",label='Annual Members')
kxma2= sns.scatterplot(data=trips_data_wa_d, x="max temperature", y="casual_members",label='Casual Members')
plt.legend()
kxma1.set_xlabel('Maximum Daily Temperature',fontsize=16)
kxma2.set_ylabel('Rides per Day',fontsize=16)
plt.show()
# Scatter: minimum daily temperature vs. rides per day, per user type.
plt.figure(figsize=(8, 5))
# Fixed title: this plot uses the MINIMUM daily temperature (the original
# said 'Maximum'); also fixed the 'Temperatue' typo.
plt.title('Minimum Temperature Influence on Ridership', fontsize=18)
kxmi1= sns.scatterplot(data=trips_data_wa_d, x="min temperature", y="annual_members",label='Annual Members')
kxmi2= sns.scatterplot(data=trips_data_wa_d, x="min temperature", y="casual_members",label='Casual Members')
plt.legend()
kxmi1.set_xlabel('Minimum Daily Temperature',fontsize=16)
kxmi2.set_ylabel('Rides per Day',fontsize=16)
plt.show()
After temperature, relative humidity is examined using scatter plot
# Scatter: average daily relative humidity vs. rides per day, per user type.
plt.figure(figsize=(8, 5))
plt.title('Relative Humidity Influence on Ridership', fontsize=18)
lx1= sns.scatterplot(data=trips_data_wa_d, x="average relative humidity", y="annual_members",label='Annual Members')
lx2= sns.scatterplot(data=trips_data_wa_d, x="average relative humidity", y="casual_members",label='Casual Members')
plt.legend()
lx1.set_xlabel('Average Daily Relative Humidity',fontsize=16)
lx1.set_ylabel('Rides per Day',fontsize=16)
plt.show()
Lastly, average wind speed per day is also examined:
# Scatter: average daily wind speed vs. rides per day, per user type.
plt.figure(figsize=(8, 5))
plt.title('Wind Speed Influence on Ridership', fontsize=18)
mx1= sns.scatterplot(data=trips_data_wa_d, x="average wind speed", y="annual_members",label='Annual Members')
mx2= sns.scatterplot(data=trips_data_wa_d, x="average wind speed", y="casual_members",label='Casual Members')
plt.legend()
mx1.set_xlabel('Average Daily Wind Speed',fontsize=16)
mx1.set_ylabel('Rides per Day',fontsize=16)
plt.show()
From the above scatter plots we can see that both temperature (whether max or min) and average wind speed have a significant impact on daily ridership for both annual and casual members. With a higher temperature there will be more ridership, and on colder days the number of riders is reduced. This is also evident for wind speed, where with higher wind speed it may be more difficult to ride outside compared with low wind speed.
On the contrary, relative humidity does not have a significant impact on ridership for both user groups which aligns with the expectation that bike riders are not concerned with relative humidity, only precipitation.
This section will examine whether the pandemic has influenced bike sharing behaviour
# Load Toronto COVID-19 case data and take a first look.
pandemic_data = pd.read_csv('COVID19 cases Toronto.csv')
pandemic_data.head()
pandemic_data.info()
We will be using episode data as our date for comparison, but we need to convert to datetime first and localize
# Localize the naive Episode Date to UTC, then convert to EST to match the
# trips data.
# NOTE(review): treating naive dates as UTC shifts them by 5 hours once
# rendered in EST — confirm the source timezone of the CSV.
pandemic_data['Episode Date'] = pd.DatetimeIndex(pandemic_data['Episode Date']).tz_localize(tz = 'UTC').tz_convert(tz = 'EST')
pandemic_data.info()
Checking NA values
# Count missing values per column.
pandemic_data.isnull().sum(axis=0).to_frame('count')
Now grouping the data, but before doing so let's examine what frequency is best using the bike share data first
# Using the above trips_data_day dataframe to evaluate trips data on a frequency of days
# Daily total rides over time, to judge whether daily granularity is readable.
plt.figure(figsize=(10, 5))
plt.title('Daily ridership', fontsize=18)
# NOTE(review): x='Start Time' refers to trips_data_day's index name.
rdx=sns.lineplot(data=trips_data_day, x='Start Time', y="rides",label='total rides')
rdx.set_xlabel('Day',fontsize=12)
rdx.set_ylabel('Rides per day',fontsize=12)
plt.show()
The above graph seems crowded with too many data points, thus a monthly frequency is adopted
# Monthly counts of total, annual-member and casual-member rides.
trips_data_month = trips_data.groupby(pd.Grouper(key='Start Time', freq='M')).agg(
    rides=pd.NamedAgg(column='User Type', aggfunc='count'),
    annual_members=pd.NamedAgg(column='User Type',
                               aggfunc=lambda s: s.eq('Annual Member').sum()),
    casual_members=pd.NamedAgg(column='User Type',
                               aggfunc=lambda s: s.eq('Casual Member').sum()),
)
for col in ('rides', 'annual_members', 'casual_members'):
    trips_data_month[col] = trips_data_month[col].astype('Int64')
trips_data_month.head(13)
# Monthly total rides over time.
plt.figure(figsize=(10, 5))
plt.title('Monthly ridership', fontsize=18)
rmx=sns.lineplot(data=trips_data_month, x='Start Time', y="rides",label='rides')
rmx.set_xlabel('Month',fontsize=12)
rmx.set_ylabel('Rides per month',fontsize=12)
plt.show()
This is more clear, therefore pandemic data will also be using a frequency of month
# Monthly COVID case counts (count of Assigned_ID per episode month).
pandemic_data_month = pandemic_data
pandemic_data_month=pandemic_data_month.groupby([pd.Grouper(key='Episode Date', freq='M')]).agg(
cases=pd.NamedAgg(column='Assigned_ID', aggfunc=lambda x: (x.count())))
pandemic_data_month['cases'] = pandemic_data_month['cases'].astype('Int64')
pandemic_data_month.head(10)
Since there are NA values in the column above, let's check our dataframe and see if there are indeed dates where no patients were recorded
# Sort by episode date to inspect coverage gaps chronologically.
pandemic_data.sort_values(by='Episode Date', ascending=True,inplace = True)
pandemic_data.head(10)
The above verified that there are dates without episode date, suggesting no patient has been recorded, so we will change NA to 0
# Months with no recorded cases come out as NA after the monthly grouping;
# treat them as zero. Plain assignment avoids the deprecated inplace
# fillna on a column (chained assignment).
pandemic_data_month['cases'] = pandemic_data_month['cases'].fillna(0)
pandemic_data_month.head(100)
Let's take a brief look at what the pandemic looks like
# Monthly COVID case counts for Toronto.
plt.figure(figsize=(7, 5))
plt.title('Toronto Monthly Pandemic cases', fontsize=18)
# Renamed the axes variable: `nx` shadowed the networkx import at the top
# of the file.
pcx= sns.lineplot(data=pandemic_data_month, x='Episode Date', y="cases")
pcx.set_xlabel('Date',fontsize=12)
pcx.set_ylabel('Cases per month',fontsize=12)
plt.show()
In this section, we will be using the previously developed dataframe with daily ridership after 2018 and joining with Toronto Pandemic Data
# will be using trips_data_day dataframe developed as part of the general analysis from above
#reset both index and create column called merge_time
# NOTE(review): trips_data_month_m is an alias (not a copy) of
# trips_data_month, so adding 'merge_time' mutates both names; the
# reset_index below rebinds only trips_data_month, leaving the alias with
# the column AND the original month index. Same pattern for the pandemic
# frame. The merge works because it joins on the 'merge_time' column, not
# the index — but this aliasing is fragile; confirm intent before reuse.
trips_data_month_m=trips_data_month
trips_data_month['merge_time'] = trips_data_month.index
trips_data_month=trips_data_month.reset_index(drop=True)
pandemic_data_month_m = pandemic_data_month
pandemic_data_month_m['merge_time']= pandemic_data_month_m.index
pandemic_data_month_m=pandemic_data_month_m.reset_index(drop=True)
# merging two dataframes
# Left join keeps every bike-share month; months without pandemic data
# become NaN and are zero-filled below.
bike_pandemic_data = pd.merge(trips_data_month_m, pandemic_data_month, how ='left', on=['merge_time'])
bike_pandemic_data.fillna(0, inplace=True)
bike_pandemic_data.head()
# Overlay monthly rides and monthly pandemic cases on one (unscaled) axis.
plt.figure(figsize=(10, 5))
plt.title('Comparing Bike Share Data with Pandemic Cases per Month', fontsize=18)
bpm=sns.lineplot(data=bike_pandemic_data, x='merge_time', y='rides', label = 'Rides')
bpm=sns.lineplot(data=bike_pandemic_data, x='merge_time', y='cases',label = 'Cases')
bpm.set_xlabel('Date',fontsize=12)
bpm.set_ylabel('Rides/cases per month',fontsize=12)
plt.show()
The above graph does not suggest much correlation regarding the impact pandemic has on ridership, however, since the data is not scaled, we will now scale the data for a better visualization
# Re-index by month, then min-max scale rides and cases to [0, 1] so the
# two series are visually comparable.
bike_pandemic_data_scale=bike_pandemic_data
bike_pandemic_data_scale.index= bike_pandemic_data_scale['merge_time']
bike_pandemic_data_scale=bike_pandemic_data_scale.drop(columns=['merge_time'])
bike_pandemic_data_scale.head()
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# NOTE(review): this is an alias, so the scaling also overwrites
# bike_pandemic_data_scale in place.
bike_pandemic_data_scale_mm = bike_pandemic_data_scale
bike_pandemic_data_scale_mm[['rides', 'annual_members','casual_members','cases']] = scaler.fit_transform(bike_pandemic_data_scale_mm[['rides', 'annual_members','casual_members','cases']])
bike_pandemic_data_scale_mm.head()
# Same overlay as before, now on min-max-scaled values.
plt.figure(figsize=(10, 5))
plt.title('Comparing Bike Share Data with Pandemic Cases per Month', fontsize=18)
# NOTE(review): 'merge_time' was moved into the index above — confirm the
# installed seaborn version resolves index names for x=.
bpmmm=sns.lineplot(data=bike_pandemic_data_scale_mm, x='merge_time', y='rides', label = 'Rides')
bpmmm =sns.lineplot(data=bike_pandemic_data_scale_mm, x='merge_time', y='cases',label = 'Cases')
bpmmm.set_xlabel('Date',fontsize=12)
bpmmm.set_ylabel('Rides/cases per month',fontsize=12)
plt.show()
From the above graph, we can see that during the start of the pandemic (Between 2020 Feb to 2020 May) there is a decrease in ridership, however, after May of 2020 we can see that as the pandemic cases go down, ridership increased until 2020 September, where the second wave hit Toronto thus cases went up with a decrease in ridership.
This initial observation could imply that the pandemic has influenced overall ridership; however, upon further examination of previous year data, we can also see that for 2018 and 2019 without the pandemic, between May and September there is an increase in ridership possibly due to summer weather conditions, and this trend decreases around September due to winter conditions.
With both factors considered we can conclude that the pandemic has indeed influenced ridership, with a steeper rate of increase when the pandemic rate decreased and vice versa; however, the extent of this effect can be argued as not as significant due to weather conditions influencing user behaviour.
# Fresh reload of the raw trips for the yearly-comparison analysis.
trips_raw = pd.read_csv('trips_raw_data.csv')
trips_raw.info()
trips_raw.head()
Removing all columns except Trip ID, Trip Duration, Start Time and User Type
# Keep only the columns this analysis needs.
trips_raw = trips_raw[['Trip Id', 'Trip Duration', 'Start Time', 'User Type']]
Converting Start Time Column to a datetime format
# Parse Start Time strings into datetimes.
trips_raw ['Start Time'] = pd.DatetimeIndex(trips_raw ['Start Time'])
Analyzing how many different user types there are
# List the distinct user types present in the data.
unique_subscriptions = trips_raw['User Type'].unique()
unique_subscriptions
Removing all nan from the User Type columns
# Drop trips with no recorded user type.
trips_raw = trips_raw[~trips_raw['User Type'].isna()]
Determining the time range for the data
# Establish the covered date range.
min_date = trips_raw['Start Time'].min()
max_date = trips_raw['Start Time'].max()
print(min_date)
print(max_date)
Looks like before 2018-01-01, user types were not tracked. Also, the last 21 days of December 2020 are missing from the data set. December will be removed from each year to ensure we are comparing the same amounts of time.
# Drop December so every year spans the same Jan-Nov window.
trips_raw = trips_raw[trips_raw['Start Time'].dt.month!=12]
Split data set into casual and annual users
# Split into casual vs. annual users. .copy() so the per-type column
# assignments later don't write into a view of trips_raw
# (SettingWithCopyWarning).
casual_user = trips_raw[trips_raw['User Type'] == 'Casual Member'].copy()
annual_user = trips_raw[trips_raw['User Type'] == 'Annual Member'].copy()
Grouping data by year
# Yearly trip counts per user type.
casual_user['Year'] = casual_user['Start Time'].dt.year
annual_user['Year'] = annual_user['Start Time'].dt.year
number_casual_users = casual_user.groupby(['Year']).size()
number_annual_users = annual_user.groupby(['Year']).size()
number_casual_users
number_annual_users
# Horizontal grouped bars: yearly trip counts, casual vs. annual.
fig, ax = plt.subplots()
p1 = ax.barh(np.arange(3)+0.4, number_casual_users, 0.4, align = 'center')
p2 = ax.barh(np.arange(3), number_annual_users, 0.4, align = 'center')
# Set tick positions BEFORE labels — labels set first are discarded when
# the ticks are reset (the original had the calls reversed).
ax.set_yticks(np.arange(3)+0.4/2)
ax.set_yticklabels(number_casual_users.index)
ax.xaxis.grid(True)
ax.yaxis.grid(False)
# Fixed typo in the axis label ('Numebr').
ax.set_xlabel('Number of Trips (January to November)')
ax.set_ylabel('Year')
ax.set_title('Number of Casual User Trips vs. Annual User Trips from 2018 to 2020')
fig.set_size_inches(8,5)
ax.legend((p1[0], p2[0]), ('Casual Users', 'Annual Users'),
loc='best')
The amount of casual users has been steadily increasing. The amount of annual users has been fluctuating.
# Reload the raw trips; keep `trips` as an untouched master copy and work
# on trips_raw.
trips = pd.read_csv('trips_raw_data.csv')
trips_raw = trips.copy()
trips_raw.info()
trips_raw.head()
Removing all columns except Trip ID, Trip Duration, Start Time and User Type
# Keep only the columns this analysis needs.
trips_raw = trips_raw[['Trip Id', 'Trip Duration', 'Start Time', 'User Type']]
Converting Start Time Column to a datetime format
# Parse Start Time strings into datetimes.
trips_raw ['Start Time'] = pd.DatetimeIndex(trips_raw ['Start Time'])
Analyzing how many different user types there are
# List the distinct user types present in the data.
unique_subscriptions = trips_raw['User Type'].unique()
unique_subscriptions
Removing all nan from the User Type columns
# Drop trips with no recorded user type.
trips_raw = trips_raw[~trips_raw['User Type'].isna()]
Determining the time range for the data
# Establish the covered date range.
min_date = trips_raw['Start Time'].min()
max_date = trips_raw['Start Time'].max()
print(min_date)
print(max_date)
Looks like before 2018-01-01, user types were not tracked. Also, last 21 days of December 2020 are missing from the data set. Free ride Wednesdays are only during September. Only keep data from September of each year
# Free Ride Wednesdays run in September only — keep September trips.
trips_raw = trips_raw[trips_raw['Start Time'].dt.month==9]
Split data set into casual and annual users
# Split into casual vs. annual users. .copy() so the per-type column
# assignments below don't write into a view of trips_raw
# (SettingWithCopyWarning).
casual_user = trips_raw[trips_raw['User Type'] == 'Casual Member'].copy()
annual_user = trips_raw[trips_raw['User Type'] == 'Annual Member'].copy()
Grouping data by weekday
# Trip counts per weekday (Mon=0 … Sun=6) for each user type.
casual_user['Day'] = casual_user['Start Time'].dt.weekday
annual_user['Day'] = annual_user['Start Time'].dt.weekday
number_casual_users = casual_user.groupby(['Day']).size()
number_annual_users = annual_user.groupby(['Day']).size()
number_casual_users
number_annual_users
# Horizontal grouped bars: September trips per weekday, casual vs. annual.
fig, ax = plt.subplots()
p1 = ax.barh(np.arange(7)+0.4, number_casual_users/1000, 0.4, align = 'center')
p2 = ax.barh(np.arange(7), number_annual_users/1000, 0.4, align = 'center')
# Set tick positions before labels (labels set first are discarded when
# the ticks are reset); plt.yticks below applies the final weekday names.
ax.set_yticks(np.arange(7)+0.4/2)
ax.set_yticklabels(number_casual_users.index)
ax.xaxis.grid(True)
ax.yaxis.grid(False)
Weekdays = ['Mon', 'Tue', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
plt.yticks(range(7), Weekdays)
# Fixed typo in the axis label ('Numebr').
ax.set_xlabel('Number of Trips (1000s)')
ax.set_ylabel('Weekday')
ax.set_title('Free Ride Wednesday Popularity, 2017 - 2020 September Only, Casual Vs. Annual Users')
ax.legend((p1[0], p2[0]), ('Casual Users', 'Annual Users'),
loc='best')
There is a spike on Free Ride Wednesday. However, the weekends are still more popular for casual users. Annual users are most likely using the bike share for their commutes, rather than for leisure (like the casual users).
Removing all columns except Trip ID, Trip Duration, Start Time and User Type
# Start again from the raw trips. Use a copy so the in-place edits below
# cannot mutate the `trips` master dataframe (the original aliased it).
trips_raw = trips.copy() # copied from before
trips_raw = trips_raw[['Trip Id', 'Trip Duration', 'Start Time', 'User Type']]
Converting Start Time Column to a datetime format
# Parse Start Time strings into datetimes.
trips_raw ['Start Time'] = pd.DatetimeIndex(trips_raw ['Start Time'])
Analyzing how many different user types there are
# List the distinct user types present in the data.
unique_subscriptions = trips_raw['User Type'].unique()
unique_subscriptions
Removing all nan from the User Type columns
# Drop trips with no recorded user type.
trips_raw = trips_raw[~trips_raw['User Type'].isna()]
Determining the time range for the data
# Establish the covered date range.
min_date = trips_raw['Start Time'].min()
max_date = trips_raw['Start Time'].max()
print(min_date)
print(max_date)
Looks like before 2018-01-01, user types were not tracked. Also, the last 21 days of December 2020 are missing from the data set, so December will be removed from each year to ensure we are comparing equal spans of time.
# Drop December so every year spans the same Jan-Nov window (the last 21
# days of December 2020 are missing from the data).
trips_raw = trips_raw[trips_raw['Start Time'].dt.month!=12]
Split data set into casual and annual users
# Split into casual vs. annual users. .copy() so the many calendar-feature
# assignments below don't write into a view of trips_raw
# (SettingWithCopyWarning).
casual_user = trips_raw[trips_raw['User Type'] == 'Casual Member'].copy()
annual_user = trips_raw[trips_raw['User Type'] == 'Annual Member'].copy()
Adding column to dataframe to indicate the year, month, week, weekday and hour the trip took place
# Derive the calendar features used by the groupbys below.
casual_user['Year'] = casual_user['Start Time'].dt.year
annual_user['Year'] = annual_user['Start Time'].dt.year
casual_user['Month'] = casual_user['Start Time'].dt.month
annual_user['Month'] = annual_user['Start Time'].dt.month
# Series.dt.week was deprecated and removed in pandas 2.0; use the ISO
# calendar week instead.
casual_user['Week'] = casual_user['Start Time'].dt.isocalendar().week
annual_user['Week'] = annual_user['Start Time'].dt.isocalendar().week
casual_user['Weekday'] = casual_user['Start Time'].dt.weekday
annual_user['Weekday'] = annual_user['Start Time'].dt.weekday
casual_user['Day'] = casual_user['Start Time'].dt.day
annual_user['Day'] = annual_user['Start Time'].dt.day
casual_user['Hour'] = casual_user['Start Time'].dt.hour
annual_user['Hour'] = annual_user['Start Time'].dt.hour
casual_user.head()
annual_user.head()
Grouping data by month over 2018 to 2020. Dividing by 3 to get the average number of trips for that month of the year.
# Average monthly trips (totals over 2018-2020 divided by 3 years).
number_casual_users_month = (casual_user.groupby(['Month']).size())/3
number_annual_users_month = (annual_user.groupby(['Month']).size())/3
Plotting monthly distribution (average month over the 3 years)
# Average monthly trips over the year, casual vs. annual.
fig, ax = plt.subplots()
p1 = ax.plot(number_casual_users_month/1000, linewidth = 3)
p2 = ax.plot(number_annual_users_month/1000, linewidth = 3)
ax.yaxis.grid(True)
ax.xaxis.grid(False)
Months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',]
# The series index is Month 1..12, so ticks must sit at 1..12 — the
# original placed them at 0..11, shifting every label by one month.
plt.xticks(range(1, 13), Months)
# Fixed typo ('Numebr') and corrected the x-axis label: the data is
# grouped by month, not week.
ax.set_ylabel('Number of Trips (1000s)')
ax.set_xlabel('Month of the Year')
ax.set_title('Average Monthly Users from 2018 - 2020')
ax.legend((p1[0],p2[0]), ('Casual Users','Annual Users'),
loc='best')
Grouping daily data over the 7 days of the week
# Average trips per weekday (Mon=0 … Sun=6) across the 3 years.
number_casual_users_day = (casual_user.groupby(['Weekday']).size())/3
number_annual_users_day = (annual_user.groupby(['Weekday']).size())/3
Plotting daily distribution over the week (average day over the 3 years)
# Average trips per weekday, casual vs. annual.
fig, ax = plt.subplots()
p1 = ax.plot(number_casual_users_day/1000, linewidth = 3)
p2 = ax.plot(number_annual_users_day/1000, linewidth = 3)
ax.yaxis.grid(True)
ax.xaxis.grid(False)
Weekdays = ['Mon', 'Tue', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
plt.xticks(range(7), Weekdays)
# Fixed typo in the axis label ('Numebr').
ax.set_ylabel('Number of Trips (1000s)')
ax.set_xlabel('Day of the Week')
ax.set_title('Average Daily Users over the Week from 2018 - 2020')
ax.legend((p1[0],p2[0]), ('Casual Users','Annual Users'),
loc='best')
Grouping hourly data over the day
# Average trips per hour of day across the 3 years.
number_casual_users_Hour = (casual_user.groupby(['Hour']).size())/3
number_annual_users_Hour = (annual_user.groupby(['Hour']).size())/3
Plotting hourly distribution over the day (average day over the 3 years)
# Average trips per hour of day, casual vs. annual.
fig, ax = plt.subplots()
p1 = ax.plot(number_casual_users_Hour/1000, linewidth = 3)
p2 = ax.plot(number_annual_users_Hour/1000, linewidth = 3)
ax.yaxis.grid(True)
ax.xaxis.grid(False)
# Fixed typo in the axis label ('Numebr').
ax.set_ylabel('Number of Trips (1000s)')
ax.set_xlabel('Hour of the Day')
ax.set_title('Average Hourly Users over the Day from 2018 - 2020')
ax.legend((p1[0],p2[0]), ('Casual Users','Annual Users'),
loc='best')
fig.set_size_inches(8,5)
plt.show()
Since most of the cycling demand is in the summer months, only holidays in May, June, July and August will be analyzed. Additionally, we will only be looking at holidays in 2019.
# Restrict both user groups to 2019 only.
casual_users_may_to_aug_2019 = casual_user[casual_user['Year'].eq(2019)]
annual_users_may_to_aug_2019 = annual_user[annual_user['Year'].eq(2019)]
# One subset per holiday month: May=5, June=6, July=7, August=8.
casual_users_may_2019 = casual_users_may_to_aug_2019[casual_users_may_to_aug_2019['Month'].eq(5)]
annual_users_may_2019 = annual_users_may_to_aug_2019[annual_users_may_to_aug_2019['Month'].eq(5)]
casual_users_jun_2019 = casual_users_may_to_aug_2019[casual_users_may_to_aug_2019['Month'].eq(6)]
annual_users_jun_2019 = annual_users_may_to_aug_2019[annual_users_may_to_aug_2019['Month'].eq(6)]
casual_users_jul_2019 = casual_users_may_to_aug_2019[casual_users_may_to_aug_2019['Month'].eq(7)]
annual_users_jul_2019 = annual_users_may_to_aug_2019[annual_users_may_to_aug_2019['Month'].eq(7)]
casual_users_aug_2019 = casual_users_may_to_aug_2019[casual_users_may_to_aug_2019['Month'].eq(8)]
annual_users_aug_2019 = annual_users_may_to_aug_2019[annual_users_may_to_aug_2019['Month'].eq(8)]
Grouping daily data over May
# Trips per day of month for May 2019 (no /3 here: single year).
number_casual_users_day = casual_users_may_2019.groupby('Day').size()
number_annual_users_day = annual_users_may_2019.groupby('Day').size()
Plotting daily data over May
# Daily trip counts for May 2019, with holiday dates marked.
fig, ax = plt.subplots()
p1 = ax.plot(number_casual_users_day/1000, linewidth = 3)
p2 = ax.plot(number_annual_users_day/1000, linewidth = 3)
# Fixed misspelled variable names: mohters_day/vitoria_day.
mothers_day = plt.axvline(x = 12, color = 'red')     # Mother's Day: May 12, 2019
victoria_day = plt.axvline(x = 20, color = 'green')  # Victoria Day: May 20, 2019
ax.yaxis.grid(True)
ax.xaxis.grid(False)
ax.set_ylabel('Number of Trips (1000s)')  # fixed typo: 'Numebr'
ax.set_xlabel('Day of Month')
ax.set_title('Daily Trips in May 2019')
ax.legend((p1[0], p2[0], mothers_day, victoria_day),
          ('Casual Users', 'Annual Users', 'Mothers Day', 'Victoria Day'),
          loc='best')
Grouping daily data over June
# Trips per day of month for June 2019.
number_casual_users_day = casual_users_jun_2019.groupby('Day').size()
number_annual_users_day = annual_users_jun_2019.groupby('Day').size()
Plotting daily data over June
# Daily trip counts for June 2019, with Father's Day marked.
fig, ax = plt.subplots()
p1 = ax.plot(number_casual_users_day/1000, linewidth = 3)
p2 = ax.plot(number_annual_users_day/1000, linewidth = 3)
fathers_day = plt.axvline(x = 16, color = 'red')  # Father's Day: June 16, 2019
ax.yaxis.grid(True)
ax.xaxis.grid(False)
ax.set_ylabel('Number of Trips (1000s)')  # fixed typo: 'Numebr'
ax.set_xlabel('Day of Month')
ax.set_title('Daily Trips in June 2019')
ax.legend((p1[0], p2[0], fathers_day), ('Casual Users', 'Annual Users', 'Fathers Day'),
          loc='best')
Grouping daily data over July
# Trips per day of month for July 2019.
number_casual_users_day = casual_users_jul_2019.groupby('Day').size()
number_annual_users_day = annual_users_jul_2019.groupby('Day').size()
Plotting daily data over July
# Daily trip counts for July 2019, with Canada Day marked.
fig, ax = plt.subplots()
p1 = ax.plot(number_casual_users_day/1000, linewidth = 3)
p2 = ax.plot(number_annual_users_day/1000, linewidth = 3)
canada_day = plt.axvline(x = 1, color = 'red')  # Canada Day: July 1
ax.yaxis.grid(True)
ax.xaxis.grid(False)
ax.set_ylabel('Number of Trips (1000s)')  # fixed typo: 'Numebr'
ax.set_xlabel('Day of Month')
ax.set_title('Daily Trips in July 2019')
ax.legend((p1[0], p2[0], canada_day), ('Casual Users', 'Annual Users', 'Canada Day'),
          loc='best')
# Bump figure resolution, then reload the raw trips export for the spatial analysis.
plt.rcParams['figure.dpi'] = 450
trips_raw = pd.read_csv('trips_raw_data.csv')
trips_raw.head()
Lets see which neighbourhoods have the highest departures and arrivals from the bike share data.
First, lets use the Neighbourhoods file from the City of Toronto Open Data. The City has 140 designated neighbourhoods, and they are composed of a couple census tracts.
We're using the geojson version since that is easier to work with than managing the multiple files needed for a shapefile.
# Load the 140 City of Toronto neighbourhood polygons (Open Data geojson).
neighbourhoods = gpd.read_file('Neighbourhoods.geojson')
neighbourhoods.head(5)
In this step, I'll rename some columns, and remove the neighbourhood id from the name string using some regex commands. We basically are removing cases where there are 1, 2, or 3 numerical digits from the string.
We'll still keep the the neighbourhood id, as its better to use that as our unique identifier.
# Rename to friendlier column names, then strip the trailing neighbourhood id
# (e.g. "Casa Loma (96)") from the name string.
neighbourhoods = neighbourhoods.rename(columns = {'AREA_NAME':'name', 'AREA_SHORT_CODE':'area_short'})
# One explicit regex pass covering 1-3 digits replaces the three chained
# replaces, which relied on pandas' deprecated implicit regex=True default.
neighbourhoods['name'] = neighbourhoods['name'].str.replace(r' \(\d{1,3}\)', '', regex=True)
neighbourhoods = neighbourhoods[['area_short','name', 'geometry']]
In a spreadsheet, I went ahead and manually classified each neighbourhood into a higher level of geography. I'm using the pre-1998 Metro Toronto Borough/City boundaries to classify neighbourhoods into North York, Scarborough, York, East York, and Etobicoke. This process was easy, since neighbourhood boundaries (and census tracts) line up with the former Metro Toronto boundaries, and the area_short parameter is numerically grouped into the former cities/boroughs.
I further subdivided neighbourhoods located in the former Old Toronto. For Downtown, I used the TOcore definition of Downtown. From there, the East End is everything east of the Don River, the West End is generally west of Bathurst, and the North End is generally everything north of Bloor Street. The Annex, and Wychwoods neighbourhoods were placed in the North End, and those neighbourhoods formed the boundary between the North and West ends.
# Join each neighbourhood to its manually-assigned borough grouping.
neighbourhoods = neighbourhoods.merge(pd.read_csv('neighbourhood_groupings.csv'))
neighbourhoods.head()
To make the maps look more recognizable to the average person, and for easy distance calculations, we're going to use a mercator projection. EPSG = 26917 corresponds to UTM zone 17N, which is the associated zone for Toronto.
# Reproject to UTM zone 17N (EPSG:26917) so distances are in metres.
neighbourhoods = neighbourhoods.to_crs(epsg = '26917')
We'll also rotate our geography 17 degrees, so that Yonge Street is a vertical line. This also makes it easier to read, and this decision is frequently used in official city publications, such as TTC maps, or official City of Toronto Maps.
# Rotate every polygon -17 degrees about the origin so Yonge Street plots vertically.
neighbourhoods['geometry'] = neighbourhoods['geometry'].apply(
    lambda geom: shapely.affinity.rotate(geom, angle=-17, origin=Point(0, 0)))
Let's make a quick map of our borough groupings. I'll use the dissolve method for GeoDataFrames to group neighbourhoods by boroughs and dissolve their borders.
# Dissolve neighbourhood polygons into one polygon per borough.
boroughs = neighbourhoods.dissolve(by = 'Borough')
city_boundary = gpd.geoseries.GeoSeries([neighbourhoods.unary_union]) # to create a single polygon representing Toronto
# Persist the city outline for reuse elsewhere.
city_boundary.to_file('to_boundary.geojson', driver = 'GeoJSON')
I'm going to create a centroid column. This will make it easier to plot labels, as the labels will be over the centroid.
# Store each borough's centroid as an (x, y) tuple — used below to place labels.
boroughs['centroid'] = boroughs['geometry'].apply(lambda geom: geom.centroid.coords[0])
boroughs = boroughs.reset_index()
from mpl_toolkits.axes_grid1.anchored_artists import AnchoredDirectionArrows # for north arrow
# Reference map of the borough groupings used throughout the analysis.
fig, ax = plt.subplots()
fig.set_size_inches(7,4)
boroughs.plot(cmap = 'tab10', ax = ax)
# north arrow — tilted 17 degrees to match the rotated geography
x, y, arrow_length = 0.85, 0.4, 0.15
ax.annotate('N', xy=(x + 0.15 * math.sin(math.radians(17)), y), xytext=(x, y- 0.15 * math.cos(math.radians(17))),
            arrowprops = dict(facecolor='black', width=1, headwidth=6),
            ha='center', va='center', fontsize=10,
            xycoords=ax.transAxes)
plt.suptitle('Boroughs used for the BikeShare Project', fontsize = 12, fontweight = 'bold', y = 0.94)
ax.set_title('Based on pre-1998 borders of Toronto', fontsize = 8)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_axis_off()
ax.add_artist(ScaleBar(1, location = 'lower right', length_fraction = 0.25)) # scalebar
for index, row in boroughs.iterrows(): # iteratively plot borough labels at centroids
    # Pass label text positionally: the keyword was `s` in Matplotlib <3.3 and
    # `text` in >=3.3, so positional works on both (s= raises on newer versions).
    plt.annotate(row['Borough'], xy=row['centroid'], ha = 'center', fontsize = 8)
Now lets join the bikeshare stations locations to a neighbourhood and borough.
# Station list — expected to carry lat/lon columns used below; TODO confirm schema
stations = pd.read_csv('bikeshare_stations.csv')
First we need to create a GeoDataFrame using the lat and lon found in the csv. Then we need to convert it to a UTM projection and rotate the geometry by 17 degrees.
# Build point geometries from lon/lat (WGS84), reproject to UTM 17N,
# then apply the same -17 degree rotation as the neighbourhood layer.
stations_gdf = gpd.GeoDataFrame(
    stations,
    geometry=gpd.points_from_xy(stations.lon, stations.lat),
).set_crs(epsg = '4326').to_crs(epsg = '26917')
stations_gdf['geometry'] = stations_gdf['geometry'].apply(
    lambda geom: shapely.affinity.rotate(geom, angle=-17, origin=Point(0, 0)))
To spatially join the datasets, we'll use Geopandas' sjoin function to return which neighbourhood a bike share station is in.
# Spatial join: tag each station with the neighbourhood that contains it.
# NOTE(review): with neighbourhoods on the left, the result keeps the
# neighbourhood polygon geometry, not the station points.
stations_gdf = gpd.sjoin(neighbourhoods, stations_gdf)
First, let's count the number of departures by start stations. To reduce the complexity of the data, we can drop the other columns. We'll name the count column as Trips.
# Departures per start station: count trips, keep one column, rename it 'Trips'.
trips_origin = (
    trips_raw.groupby('Start Station Id')['Trip Id']
    .count()
    .reset_index()
    .rename(columns={'Trip Id': 'Trips'})
)
trips_origin.head()
We'll use left join the dataset to the lookup table with the neighbourhoods and station Ids. Here we're doing two left joins, one with the GeoDataFrame of the stations, then with the GeoDataFrame of the neighbourhoods to make sure that when we plot the data, it will plot the neighbourhoods.
Then, we need to do another groupby aggregation (using the sum function) to sum up the trip origins by neighbourhood.
# Attach per-station departure counts via Station Id; stations with no trips get 0.
trips_origin_gdf = stations_gdf.merge(trips_origin, right_on = 'Start Station Id', left_on = 'Station Id', how = 'left')
trips_origin_gdf['Trips'] = trips_origin_gdf['Trips'].fillna(0)
# Sum station counts up to neighbourhood level, then re-join onto the polygon
# layer (left merge keeps neighbourhoods that have no stations at all).
trips_origin_gdf = neighbourhoods.merge(trips_origin_gdf.groupby('area_short').sum()[['Trips']].reset_index(), how = 'left')
# Choropleth of trip origins by neighbourhood.
fig, ax = plt.subplots()
fig.set_size_inches(7,4)
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.1) # to clean up the choropleth legend
trips_origin_gdf.plot(column = 'Trips', cmap = 'YlGnBu', ax = ax, legend = True,
                      zorder = 1, edgecolor = 'k', linewidth = 0.2,
                      legend_kwds={'label': "Trips Since 2017"}, cax = cax, vmin = 0, vmax = 1800000)
city_boundary.plot(ax = ax, zorder = 0, color = 'xkcd:light grey') #any neighbourhood without a bike share trip will be grey
# north arrow — tilted 17 degrees to match the rotated geography
x, y, arrow_length = 0.85, 0.4, 0.15
ax.annotate('N', xy=(x + 0.15 * math.sin(math.radians(17)), y), xytext=(x, y- 0.15 * math.cos(math.radians(17))),
            arrowprops = dict(facecolor='black', width=1, headwidth=6),
            ha='center', va='center', fontsize=10,
            xycoords=ax.transAxes)
plt.suptitle('BikeShare origins are concentrated downtown', fontsize = 12, fontweight = 'bold', y = 0.93)
# fixed user-facing spelling: 'Chloropleth' -> 'Choropleth'
ax.set_title('Choropleth map of trip origins since 2017, by neighbourhood', fontsize = 8)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_axis_off()
ax.add_artist(ScaleBar(1, location = 'lower right', length_fraction = 0.25))# scale bar
The data shows that the trips are strongly concentrated downtown. Outside of downtown, there are some activity in neighbourhoods along the Waterfront, specifically the Martin Goodman Trail (such as South Parkdale, and Humber Bay Shores).
Let's make a bar chart to find out the names of the top 10 neighbourhoods for bikeshare trip origins.
# Horizontal bar chart: the 10 busiest origin neighbourhoods, coloured by borough.
fig, ax = plt.subplots()
fig.set_size_inches(7,4)
top_10_origins = trips_origin_gdf.sort_values(by = 'Trips', ascending = False).head(10)
sns.barplot(y = 'name', x = 'Trips', data = top_10_origins, ax = ax, orient = 'h', hue = 'Borough', dodge = False)
ax.set_yticks(range(10))
ax.set_yticklabels(top_10_origins['name'])
ax.set_xticks(range(0, 1750000, 250000))
# thousands separators on the trip-count axis
thousands_fmt = mpl.ticker.StrMethodFormatter('{x:,.0f}')
ax.xaxis.set_major_formatter(thousands_fmt)
ax.set_ylabel('Neighbourhood')
ax.set_xlabel('Trips')
ax.set_title('Number of trips since 2017, by neighbourhood', fontsize = 10)
plt.suptitle('Downtown is the biggest source of trips', fontweight = 'bold', fontsize = 12)
plt.show()
Again a similar story with the map. The non-downtown neighbourhoods that show up on this list are neighbourhoods that border the lake.
Whats not on this chart are other downtown neighbourhoods, such as St Jamestown, Cabbagetown, and Regent Park. This may suggest that income might play a factor in determining the number of bikeshare trips a neighbourhood has.
Now let's repeat all the same steps above, but for trip destinations. This means we'll group by the End Station Id before joining.
# Arrivals per end station, rolled up to neighbourhoods — mirrors the origin pipeline.
trips_dest = trips_raw.groupby('End Station Id').count()[['Trip Id']]
trips_dest = trips_dest.reset_index()
trips_dest = trips_dest.rename(columns = {'Trip Id': 'Trips'})
trips_dest_gdf = stations_gdf.merge(trips_dest, right_on = 'End Station Id', left_on = 'Station Id', how = 'left')
# Stations with no recorded arrivals get 0 — this fillna was present in the
# origin pipeline but missing here; added for consistency between the frames.
trips_dest_gdf['Trips'] = trips_dest_gdf['Trips'].fillna(0)
trips_dest_gdf = neighbourhoods.merge(trips_dest_gdf.groupby('area_short').sum()[['Trips']].reset_index(), how = 'left')
# Choropleth of trip destinations by neighbourhood.
fig, ax = plt.subplots()
fig.set_size_inches(7,4)
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.1) # to clean up the choropleth legend
trips_dest_gdf.plot(column = 'Trips', cmap = 'YlGnBu', ax = ax, legend = True,
                    zorder = 1, edgecolor = 'k', linewidth = 0.2,
                    legend_kwds={'label': "Trips Since 2017"}, cax = cax, vmin = 0, vmax = 1800000)
city_boundary.plot(ax = ax, zorder = 0, color = 'xkcd:light grey') #any neighbourhood without a bike share trip will be grey
# north arrow — tilted 17 degrees to match the rotated geography
x, y, arrow_length = 0.85, 0.4, 0.15
ax.annotate('N', xy=(x + 0.15 * math.sin(math.radians(17)), y), xytext=(x, y- 0.15 * math.cos(math.radians(17))),
            arrowprops = dict(facecolor='black', width=1, headwidth=6),
            ha='center', va='center', fontsize=10,
            xycoords=ax.transAxes)
plt.suptitle('BikeShare destinations are also concentrated downtown', fontsize = 12, fontweight = 'bold', y = 0.93)
# fixed user-facing spelling: 'Chloropleth' -> 'Choropleth'
ax.set_title('Choropleth map of trip destinations since 2017, by neighbourhood', fontsize = 8)
# hide axes for a clean map — present in the sibling maps but missing here
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_axis_off()
ax.add_artist(ScaleBar(1, location = 'lower right', length_fraction = 0.25))# scale bar
# Horizontal bar chart: the 10 busiest destination neighbourhoods, coloured by borough.
fig, ax = plt.subplots()
fig.set_size_inches(7,4)
top_10_origins = trips_dest_gdf.sort_values(by = 'Trips', ascending = False).head(10)
sns.barplot(y = 'name', x = 'Trips', data = top_10_origins, ax = ax, orient = 'h', hue = 'Borough', dodge = False)
ax.set_yticks(range(10))
ax.set_yticklabels(top_10_origins['name'])
ax.set_xticks(range(0, 2000000, 250000))
# thousands separators on the trip-count axis
thousands_fmt = mpl.ticker.StrMethodFormatter('{x:,.0f}')
ax.xaxis.set_major_formatter(thousands_fmt)
ax.set_ylabel('Neighbourhood')
ax.set_xlabel('Trips')
ax.set_title('Number of trips since 2017, by neighbourhood', fontsize = 10)
plt.suptitle('A similar concentration in downtown exists for destinations', fontweight = 'bold', fontsize = 12)
plt.show()
Now let's to see if there are any imbalances between the origins and destinations. To do this, we'll merge the origin and destination tables together, then get the relative difference of the trips by taking
$ (origins - destinations)/(origins)*100\% $
A negative value would indicate the neighbourhood receives more destinations than origins, while a positive value would indicate more origins than destinations.
# Combine origin and destination totals per neighbourhood into one frame.
trips_neighbourhood = trips_origin_gdf.merge(trips_dest_gdf, on = ['area_short', 'name', 'geometry', 'Borough'],
                                             suffixes = ['_origin', '_destination'])
# Relative imbalance as a fraction (not multiplied by 100): positive = more
# origins than destinations, negative = more destinations.
trips_neighbourhood['delta_perc'] = (trips_neighbourhood['Trips_origin'] - trips_neighbourhood['Trips_destination'])/trips_neighbourhood['Trips_origin']
trips_neighbourhood['delta_perc'].min(), trips_neighbourhood['delta_perc'].max()
# Choropleth of the origin/destination imbalance per neighbourhood.
fig, ax = plt.subplots()
fig.set_size_inches(7,4)
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.1) #to clean up the chloropleth legend
# Diverging colormap centred on 0: one tail is an origin surplus, the other a
# destination surplus; symmetric vmin/vmax keeps 0 at the neutral colour.
trips_neighbourhood.plot(column = 'delta_perc', cmap = 'PiYG', ax = ax, legend = True,
zorder = 1, edgecolor = 'k', linewidth = 0.2,
legend_kwds={'label': "Relative Difference Fraction"}, cax = cax, vmin = -0.3, vmax = 0.3)
city_boundary.plot(ax = ax, zorder = 0, color = 'xkcd:light grey') #any neighbourhood without a bike share trip will be grey
#north arrow
x, y, arrow_length = 0.85, 0.4, 0.15
ax.annotate('N', xy=(x + 0.15 * math.sin(math.radians(17)), y), xytext=(x, y- 0.15 * math.cos(math.radians(17))),
arrowprops = dict(facecolor='black', width=1, headwidth=6),
ha='center', va='center', fontsize=10,
xycoords=ax.transAxes)
plt.suptitle('Users starting in Midtown are likely not making return trips', fontsize = 12, fontweight = 'bold', y = 0.93)
ax.set_title('Relative difference of origins to destinations, by neighbourhood', fontsize = 8)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_axis_off()
ax.add_artist(ScaleBar(1, location = 'lower right', length_fraction = 0.25))# scale bar
From the map, we can see that North End/Midtown neighbourhoods have more origins than destinations. We can probably infer that users starting in the North End are travelling south to downtown, the lakeshore, or the beaches, and do not want to bike back uphill.
We can see that the topography of the city plays a factor in Bikeshare trips.
Now that we know the origin and destination of Bike Share trips, lets see where they are going. To avoid having a 140x140 matrix, we'll use the borough definitions from earlier.
Lets merge the trips dataset with stations_gdf to assign which borough a trip starts at.
# Tag each trip with the borough of its start station via the station lookup.
trips_chord = trips_raw.merge(stations_gdf, left_on = 'Start Station Id', right_on = 'Station Id')
trips_chord = trips_chord.rename(columns = {'area_short':'origin_area_short', 'Borough': 'Start Borough'})
trips_chord = trips_chord[['origin_area_short', 'Trip Id', 'End Station Id', 'Start Borough']]
# NOTE(review): this fillna patches the origin choropleth frame (left-merged
# neighbourhoods without stations carry NaN Trips); it is unrelated to the
# chord pipeline above — verify it isn't needed earlier instead.
trips_origin_gdf['Trips'] = trips_origin_gdf['Trips'].fillna(0)
We'll also repeat the process for the destinations.
# Repeat the borough tagging for each trip's end station, then keep only the
# columns the origin-destination matrix needs.
trips_chord = (
    trips_chord.merge(stations_gdf, left_on = 'End Station Id', right_on = 'Station Id')
    .rename(columns = {'area_short':'dest_area_short', 'Borough': 'End Borough'})
)
trips_chord = trips_chord[['Trip Id', 'Start Borough', 'End Borough']]
trips_chord.head()
Now let's perform a groupby action, using count() as the aggregation method.
# Count trips per (start borough, end borough) pair and name the count 'Trips'.
trips_chord = (
    trips_chord.groupby(['Start Borough', 'End Borough'])
    .count()
    .reset_index()
    .rename(columns = {'Trip Id': 'Trips'})
)
To convert to an origin-destination matrix, we'll use pivot function found in pandas.
Traditionally in transportation, origins are represented by rows, while destinations are columns.
# Pivot long OD counts into a wide matrix: origins on rows, destinations on
# columns (transportation convention); borough pairs with no trips become 0.
trips_matrix = trips_chord.pivot(index = 'Start Borough', columns = 'End Borough', values = 'Trips').fillna(0)
trips_matrix
To make a chord diagram, we'll need to install a package that uses matplotlib to easily make chord diagrams.
As an initial visualization, chord diagrams can convey the flow a lot easier than a heatmap, which needs annotations to be useful, and may be overwhelming for non-technical audiences.
!pip install mpl-chord-diagram
from mpl_chord_diagram import chord_diagram
#manually defining names so that certain boroughs are on 2 lines.
names = ['Downtown', 'East\nEnd', 'East\nYork', 'Etobicoke', 'North\nEnd',
'North\nYork', 'Scarborough', 'West\nEnd', 'York']
fig, ax = plt.subplots()
plt.suptitle('Most Trips Are Within Downtown, or Old Toronto', x = 0.249, y = 0.99, ha = 'left', fontsize = 10, fontweight = 'bold')
ax.text(s = 'Chord diagram of origin and destination of BikeshareTO trips, by Borough', x = 0.03, y = 1.6, ha = 'center', fontsize = 6)
# order= rearranges the boroughs around the circle; use_gradient fades each
# ribbon between its origin and destination colours.
chord = chord_diagram(trips_matrix, names = names, cmap = 'Set1', ax = ax, fontsize = 8, rotate_names = 90,
order = [0,2,1,3,4,5,7,6,8], pad = 4, use_gradient = True)
Here, we can easily see that Downtown to Downtown trips make up close to half of the trips. The other Old Toronto boroughs have a significant amount of trips, and roughly half of their trips go to/come from downtown. Inter-borough trips are also a significant source of trips for all boroughs.
Of the remaining boroughs, Etobicoke (again likely due to the Martin Goodman Trail) looks to be the busiest, while York and North York are among the bottom for trips. This is likely because Scarborough has significant activity.
Another way to visualize the flow between regions of Toronto is to make a heat map. We'll convert the data in the matrix to ratios/percentages, and then uses sns.heatmap to make the graph.
# Canonical borough ordering: Old Toronto areas first, then the suburbs.
order = ['Downtown', 'West End', 'East End', 'North End', 'East York',
         'Etobicoke', 'North York', 'Scarborough', 'York']
trips_matrix = trips_matrix.reindex(index=order, columns=order)
# Express each OD cell as a share of all trips before plotting.
total_trips = trips_matrix.sum().sum()
trips_matrix_perc = trips_matrix / total_trips
ax = sns.heatmap(trips_matrix_perc, annot = True, fmt = '.1%', cmap="YlGnBu", annot_kws={"fontsize":8})
ax.text(s = 'Intra-Downtown trips constitute half of all trips', fontweight = 'bold', y = -0.70, x = 0, ha = 'left')
ax.text(s = 'Heatmap representation of BikeShare Toronto origin-destination matrix, by borough', fontsize = 7, y = -0.25, x = 0, ha = 'left')
The advantage of the heat map is that we can clearly see the numeric values that the chord diagram represents. Downtown clearly dominates the number of trips, so in order to visualize the data in other boroughs, we'll take out intra-downtown trips, recalculate the ratios, and make a new heatmap.
# Blank out the dominant intra-Downtown cell and re-plot shares of the remainder.
# Use an explicit copy: plain assignment only aliased the same DataFrame, so the
# iloc write below silently mutated trips_matrix as well.
trips_matrix_mod = trips_matrix.copy()
trips_matrix_mod.iloc[0,0] = None
# NaN is excluded from the new total, so the remaining cells sum to 100%.
trips_matrix_perc_mod = trips_matrix_mod/trips_matrix_mod.sum().sum()
ax = sns.heatmap(trips_matrix_perc_mod, fmt = '.1%', annot = True, cmap="YlGnBu", annot_kws={"fontsize":8})
ax.text(s = 'West End Toronto is another center of BikeShare trips', fontweight = 'bold', y = -0.70, x = 0, ha = 'left')
ax.text(s = 'Heatmap representation of BikeShare Toronto origin-destination matrix, by borough', fontsize = 7, y = -0.25, x = 0, ha = 'left')
Similar to the chord diagram, we see that trips in Old Toronto capture the rest of the BikeShare trips, and there is very limited activity in the suburbs. Of the Old Toronto neighbourhoods, West End neighbourhoods has the highest activity, which could be influenced by the Martin Goodman trail, the presence of the Bloor Bike Lane, and generally more bikelane infrastructure in the West End.
One question that we would like to answer is which roads are bikeshare users using?
There are 2 easy ways to analyze this:
Method 2 would produce a more real result, that is less abstract, so we'll use this method of finding the busiest road segment from bikeshare users.
networkx¶We can use a package called networkx. networkx uses a method called graph or network theory, which is a representation of a network using nodes and links. We can then use a built in shortest path algorithm to find the path a trip takes in the network.
Below is an example graph that has been randomly created.
# Example: a 10-node random graph where each possible edge exists with
# probability 0.4 (seed fixed at 0 for reproducibility).
test = nx.fast_gnp_random_graph(10, 0.4, seed=0)
nx.draw(test)
For our application, the edges/links are the streets/trails, and the nodes are the intersections.
The City of Toronto has a open data spatial layer, the centreline, that contains the individual road segments of the city. Each segment is approximately 100m long, although this greatly varies since it depends on where an intersection is.
The centreline layer is linked directly to the intersections layer that the City of Toronto also has on their open data portal. The FNODE and TNODE columns refer to the INT_ID of the intersections.
# Load the road-segment (centreline) and intersection layers from Toronto Open Data.
centreline = gpd.read_file('centreline.geojson')
intersections = gpd.read_file('intersections/CENTRELINE_INTERSECTION_WGS84.shp')
centreline.head()
For our analysis, we will use the following columns from the centreline:
GEO_ID - the unique identifier for a segment; LFN_ID and LF_NAME - the identifier and string name of the street; FNODE and TNODE - the INT_ID that corresponds to the intersections in the intersections layer; FCODE_DESC - the road classification; geometry
centreline = centreline[['GEO_ID', 'LFN_ID', 'LF_NAME', 'FNODE', 'TNODE', 'FCODE_DESC', 'geometry']]
For all the spatial layers, I'll be converting them to UTM ZONE 17N projection, and rotating the layer 17 degrees so Toronto is perpendicular with the map boundary.
# Project to UTM 17N, then rotate every segment -17 degrees about the origin
# to match the other layers.
centreline = centreline.to_crs(epsg = '26917')
centreline['geometry'] = centreline['geometry'].apply(
    lambda geom: shapely.affinity.rotate(geom, angle=-17, origin=Point(0, 0)))
The default way for a shortest path algorithm to work is to count the number of links, but we can specify a different weight to find the shortest path. We'll calculate a length column (length in meters) for networkx to use in its shortest path calculation.
# Segment length in metres (valid now that the layer is in UTM 17N); this is
# the base edge weight for the shortest-path routing.
centreline['length'] = centreline.length
Many different road classes exist in the centreline layer.
# Inspect the road classifications present before filtering to rideable ones.
centreline['FCODE_DESC'].unique()
We obviously don't want to accidentally route trips over a path that might take them over the Gardiner expressway. The centreline layer also maps the "centre lines" of other features such as pre-1998 municipal boundaries. rivers, railways, former roads, and hydro lines. We'll need to subset the centreline to only use links that bikeshare users will be able to physically ride on.
Note: Much of my knowledge on city provided spatial layers comes from the fact that I worked at the City of Toronto during PEY.
# Road classes a BikeShare rider can physically use; everything else in the
# centreline (expressways, rivers, rail, hydro corridors, old boundaries)
# is excluded from the routing network.
valid_roads = ['Local', 'Collector', 'Major Arterial', 'Minor Arterial', 'Trail', 'Walkway']
centreline_valid = centreline[centreline['FCODE_DESC'].isin(valid_roads)]
centreline_valid
While we're at it, lets also load in the bikeways layer, also from Toronto Open Data.
# Load the bikeways layer and apply the same UTM 17N projection and
# -17 degree rotation as every other spatial layer.
bikeways = gpd.read_file('bikeways.geojson')
bikeways = bikeways.to_crs(epsg = '26917')
bikeways['geometry'] = bikeways['geometry'].apply(
    lambda geom: shapely.affinity.rotate(geom, angle=-17, origin=Point(0, 0)))
bikeways.info()
It appears 4 columns refer to the level of bike infrastructure a path has, however these 4 columns don't always agree. My sense is that the bikeways layer was created from the centreline layer where the segments are combined into an individual road instead of a road segment. This had the effect of combining road segments with different bike infrastructure, and these 4 columns were the byproduct of that.
The INFRA_HIGHORDER column, from manual inspection, seems to be the most correct, so we'll use that column as the column that tells us the bike infrastructure a link has.
# Side-by-side view of the four (sometimes conflicting) infrastructure columns.
bikeways[['ORIG_LOWORDER_INFRA', 'INFRA_HIGHORDER','INFRA_LOWORDER', 'ORIG_HIGHORDER']]
To combine the centreline and bikeways layer, we'll have to follow a couple of steps:
contains operation on the centreline. This will capture all segments within that buffer, and assign them to a bikeway, making a lookup table
bikeways_buffer = bikeways
# Work on an independent copy: buffering in place on a plain-assignment alias
# would silently overwrite the geometry of the original `bikeways` frame.
bikeways_buffer = bikeways_buffer.copy()
bikeways_buffer['geometry'] = bikeways_buffer.buffer(10)
# Segments fully inside a bikeway's 10 m buffer are assigned to that bikeway.
# NOTE(review): `op=` was renamed `predicate=` in geopandas 0.10+ — confirm the
# pinned geopandas version before upgrading.
bikeway_lookup = gpd.sjoin(bikeways_buffer, centreline_valid, how = 'left', op = 'contains')[['GEO_ID','INFRA_HIGHORDER']]
centreline_valid = centreline_valid.merge(bikeway_lookup, how = 'left')
centreline_valid
The entries for INFRA_HIGHORDER are not cleaned, and many labels are roughly similar to each other. We'll need to do some manual cleaning of this column.
# Inspect the raw infrastructure labels — they need manual cleaning below.
centreline_valid['INFRA_HIGHORDER'].unique()
centreline_valid['FCODE_DESC'].unique()
An additional complication is that while it may be easy to assume that all bikeshare users will use bikeways, this is obviously not the case, and they may choose to ride on regular arterials if they are comfortable.
Cyclists also have different levels of preference. Most cyclists will choose a cycle-track (the city defines them as bikelanes with physical separation), over a road with sharrows, unless it is a huge detour.
We'll use the concept of generalized cost to route our trips. Instead of finding the shortest length path, it will find the path with the least cost (or greatest utility). A path thats on a bike lane will cost less and bring a cyclist greater utility than a path on a busy arterial.
We'll use the length of a road segment as a starting point for our cost, then factor the length based on how willing they are to choose each type of road over another in their route planning. This means that on a map, cyclists will be willing to take a detour if it means that they can ride on a trail or a bike lane, but only to a certain threshold, and are willing to use an arterial if its a lot more direct than other options.
One study out of UBC conducted a survey of riders for their route preference and the type of bike infrastructure.
# Display the UBC route-preference survey figure inline in the notebook
from IPython.display import Image
Image('route_preference.png')
We'll use a modified version of this table to factor our lengths in the column length_fac. For each type of road, we'll set the factor to:
$factor = (1 - score)$
We'll use a bunch of np.where statements to set the factor if it meets each conditional.
# Willingness factor per road type (lower = more attractive to cyclists).
# Initialize with NaN rather than None: this keeps the column float64, and any
# segment matched by no rule gets a NaN cost instead of a Python None that
# breaks the length multiplication later.
centreline_valid['length_fac'] = np.nan
# NOTE: rule order matters — each np.where() below overwrites earlier matches.
# multiuse paths
centreline_valid['length_fac'] = np.where(centreline_valid['INFRA_HIGHORDER'].isin(['Multi-Use Trail - Entrance',
                                          'Multi-Use Trail','Park Road',
                                          'Multi-Use Trail - Boulevard',
                                          'MUT (2016 Network Plan/2012 Trails Plan)',
                                          'Multi-Use Trail (Local)',
                                          'MUT - Entrance (2016 Network Plan/2012 Trails Plan)'])
                                          , 0.5, centreline_valid['length_fac'])
# catching errors where the Martin Goodman Trail bikeway does not exactly overlap with the centreline
centreline_valid['length_fac'] = np.where(centreline_valid['FCODE_DESC'].isin(['Trail'])
                                          , 0.5, centreline_valid['length_fac'])
# all separated bike paths or cycle tracks
centreline_valid['length_fac'] = np.where(centreline_valid['INFRA_HIGHORDER'].isin(['Contraflow Cycle Track',
                                          'Cycle Track - Contraflow',
                                          'Cycle Track'])
                                          , 0.6, centreline_valid['length_fac'])
# residential roads with no bike infrastructure
centreline_valid['length_fac'] = np.where(centreline_valid['INFRA_HIGHORDER'].isnull()
                                          & centreline_valid['FCODE_DESC'].isin(['Local', 'Collector', 'Walkway']),
                                          0.9, centreline_valid['length_fac'])
# residential roads signed as bike routes
centreline_valid['length_fac'] = np.where(centreline_valid['INFRA_HIGHORDER'].isin(['Signed Route (No Pavement Markings)',
                                          'Sharrows - Wayfinding',
                                          'Sharrows'])
                                          & centreline_valid['FCODE_DESC'].isin(['Local', 'Collector']), 0.7, centreline_valid['length_fac'])
# residential bike lanes
centreline_valid['length_fac'] = np.where(centreline_valid['INFRA_HIGHORDER'].isin(['Bike Lane',
                                          'Bike Lane - Buffered',
                                          'Contraflow Bike Lane',
                                          'Bike Lane - Contraflow'])
                                          & centreline_valid['FCODE_DESC'].isin(['Local', 'Collector']), 0.6, centreline_valid['length_fac'])
# arterials or major streets with no bike infrastructure (penalized above 1.0)
centreline_valid['length_fac'] = np.where(centreline_valid['INFRA_HIGHORDER'].isnull()
                                          & centreline_valid['FCODE_DESC'].isin(['Major Arterial', 'Minor Arterial']),
                                          1.5, centreline_valid['length_fac'])
# bike lanes on arterials (but not cycle tracks/separated bike lanes)
centreline_valid['length_fac'] = np.where(centreline_valid['INFRA_HIGHORDER'].isin(['Bike Lane',
                                          'Bike Lane - Buffered',
                                          'Contraflow Bike Lane',
                                          'Bike Lane - Contraflow'])
                                          & centreline_valid['FCODE_DESC'].isin(['Major Arterial', 'Minor Arterial']),
                                          0.9, centreline_valid['length_fac'])
# arterials signed as bike routes
centreline_valid['length_fac'] = np.where(centreline_valid['INFRA_HIGHORDER'].isin(['Signed Route (No Pavement Markings)',
                                          'Sharrows - Wayfinding',
                                          'Sharrows', 'Sharrows - Arterial',
                                          'Edge Line'])
                                          & centreline_valid['FCODE_DESC'].isin(['Major Arterial', 'Minor Arterial']),
                                          1.2, centreline_valid['length_fac'])
# the Lake Shore Blvd East Trl bikeway was not matched in the lookup table, so adjust it manually
centreline_valid['length_fac'] = np.where(centreline_valid['LF_NAME'] == 'Lake Shore Blvd East Trl',
                                          0.5, centreline_valid['length_fac'])
Our length_adj column will be the column that represents the cost to the cyclist as they take a path. We'll be using this column instead of length in networkx.
# Generalized cost of a segment: raw length scaled by the preference factor
centreline_valid['length_adj'] = centreline_valid['length'] * centreline_valid['length_fac']
# Peek at the intersections layer used next
intersections.head()
We'll be using these columns from the intersections file:
# Columns kept from the intersections file:
#   INT_ID, INTERSEC5   - identify the intersection and link it to the centreline
#   CLASSIFI6, CLASSIFI7 - the classification of the intersection
#   geometry
intersections = intersections[['INT_ID', 'INTERSEC5', 'CLASSIFI6', 'CLASSIFI7', 'geometry']]
intersections = intersections.to_crs(epsg = '26917')
# Rotate the layer -17 degrees about the origin to match the rotated centreline;
# vectorized apply replaces the much slower per-row iterrows/.at loop.
intersections['geometry'] = intersections['geometry'].apply(
    lambda geom: shapely.affinity.rotate(geom, angle=-17, origin=Point(0, 0)))
While we aren't using these classifications, its interesting to note that like the centreline, the intersections file includes more than just road intersections. Pseudo intersections are ones where a road on an overpass/tunnel crosses another road, while Statistical intersections are when roads cross a jurisdictional boundary.
# Classification values include pseudo and statistical intersections (see note above)
intersections['CLASSIFI7'].unique()
networkx requires us to feed them a discrete x and y coordinate, so we'll construct this column now.
# networkx needs plain scalar coordinates, so expose X/Y as columns
intersections['X'] = intersections.geometry.x
intersections['Y'] = intersections.geometry.y
intersections
# Node table: one row per intersection with its coordinates
nodes = intersections[['INT_ID', 'X', 'Y']]
### networkx
We'll do a double inner join on the FNODE and TNODE to make sure that any link we send in has an associated intersection in the intersections file.
# Keep only links whose start (FNODE) and end (TNODE) both exist as nodes.
edge_cols = ['FNODE', 'TNODE', 'GEO_ID', 'length_adj']
edges = centreline_valid.merge(nodes, left_on='FNODE', right_on='INT_ID', how='inner')[edge_cols]
edges = edges.merge(nodes, left_on='TNODE', right_on='INT_ID', how='inner')[edge_cols]
edges
edges = edges.drop_duplicates()
Similarly, since we already filtered out a lot of features such as railways and rivers, we'll do another set of inner joins to make sure we only include nodes connecting to a link.
# Keep only nodes that appear as both a from-node and a to-node of some link.
node_cols = ['INT_ID', 'X', 'Y']
nodes = nodes.merge(edges, left_on='INT_ID', right_on='FNODE', how='inner')[node_cols]
nodes = nodes.merge(edges, left_on='INT_ID', right_on='TNODE', how='inner')[node_cols]
nodes
nodes = nodes.drop_duplicates()
nodes
edges
First we'll create the network, or the graph.
# Empty undirected graph that will hold the road network
g = nx.Graph()
Then we'll feed in the links. The first 2 arguments are the start and end nodes, the third is the length_adj column, and the last argument is the unique identifier for the link, or the GEO_ID that ties back to the centreline file.
# Add each link: endpoints FNODE/TNODE, cost = length_adj, and GEO_ID stored
# as 'key' so paths can be tied back to the centreline file. Column labels
# replace the original positional row[0]/row[3] access, which depends on
# column order and is deprecated on pandas Series.
for index, row in edges.iterrows():
    g.add_edge(row['FNODE'], row['TNODE'], length = row['length_adj'], key = row['GEO_ID'])
For the nodes, the first argument is the unique identifier, while the second is the X and Y coordinates. We won't be using networkx to draw our network flows since eats too much memory, so the position argument is not as important.
# Add each node with its coordinates as 'pos' (not used for drawing — see note
# above). Column labels replace fragile positional row[0]/row[1]/row[2] access.
for i, row in nodes.iterrows():
    g.add_node(row['INT_ID'], pos = (row['X'], row['Y']))
# Reload the raw trips and parse Start Time so we can filter by year
trips = pd.read_csv('trips_raw_data.csv')
trips['Start Time'] = pd.DatetimeIndex(trips['Start Time'])
We'll only be analyzing 2019, since it represents a normal year.
# Keep only 2019 trips (a full, normal year — see note above)
trips = trips[trips['Start Time'].dt.year == 2019]
trips
Since our graph is based on nodes and links, we need to map each bikeshare station to an intersection. Lets load in the bikeshare stations layer as a GeoDataFrame.
# Load bikeshare stations and build point geometries from lon/lat (WGS84),
# then project to UTM 17N to match the network layers.
stations = pd.read_csv('bikeshare_stations.csv')
stations_gdf = gpd.GeoDataFrame(stations, geometry=gpd.points_from_xy(stations.lon, stations.lat))
stations_gdf.set_crs(epsg = '4326', inplace=True)
stations_gdf = stations_gdf.to_crs(epsg = '26917')
# Apply the same -17 degree rotation about the origin as the other layers;
# vectorized apply replaces the slow per-row iterrows/.at loop.
stations_gdf['geometry'] = stations_gdf['geometry'].apply(
    lambda geom: shapely.affinity.rotate(geom, angle=-17, origin=Point(0, 0)))
We'll only be mapping the stations to nodes that are in our network, so we'll filter out nodes that aren't connected to any valid links.
# Keep only intersections connected to a valid link. Take an explicit copy so
# the later `intersections_valid['key'] = 0` assignment modifies this frame
# (a plain boolean slice triggers pandas' chained-assignment warning and may
# not stick). isin() accepts the array directly — no list() wrapper needed.
intersections_valid = intersections[intersections['INT_ID'].isin(centreline_valid['FNODE'].unique())].copy()
Making the X and Y coordinates discrete columns.
# Scalar station coordinates for the distance calculation below
stations_gdf['x'] = stations_gdf.geometry.x
stations_gdf['y'] = stations_gdf.geometry.y
stations_gdf
The first step to mapping the stations to an intersection is to create a cross join. This means each row will be joined to all other rows in both datasets. To do this, we'll create a zero column in both datasets, and join on that column of zeroes.
# Cross join: give both frames a constant key and merge on it, pairing every
# station with every valid intersection (all keys equal, so outer == full cross)
intersections_valid['key'] = 0
stations_gdf['key'] = 0
cross_join = intersections_valid.merge(stations_gdf[['key', 'Station Id', 'x', 'y']], how='outer')
cross_join
After cross joining the two datasets, we'll calculate the euclidean distance between the stations and the intersections. Since this is a cross join, this is being done for every possible combination of station-intersection.
Because we're in a Mercator projection, and working on a relatively small area, we can make the assumption that our surface is planar, and use the Pythagorean theorem to calculate the (squared) distance. This will significantly speed up the calculation rather than relying on a function to process 24 million rows.
# Squared planar distance (intersection X/Y vs station x/y); the sqrt is
# deliberately omitted — only the per-station minimum matters, and squaring
# preserves the ordering
cross_join['dist'] = (cross_join['X'] - cross_join['x'])**2 + (cross_join['Y'] - cross_join['y'])**2
To complete the mapping, we'll group by the station ID, and take the lowest distance, before rejoining the data back to the intersections dataset to grab the INT_ID.
# Map each station to its nearest intersection. idxmin() picks the row of the
# minimum distance per station directly. The original approach (group-min then
# merge back on the float 'dist' value) could cross-match two different
# stations that happen to share an identical distance value.
intersections_lookup = cross_join.loc[
    cross_join.groupby('Station Id')['dist'].idxmin(), ['INT_ID', 'Station Id']
].reset_index(drop=True)
Now we can do some joins to map each Station Id in the trips dataset to an INT_ID.
# Attach the nearest-intersection IDs to each trip: first for the start
# station, then for the end station, keeping only the columns we need.
trips = trips[['Start Station Id', 'End Station Id']]
trips = (trips.merge(intersections_lookup, left_on='Start Station Id', right_on='Station Id')
              .rename(columns={'INT_ID': 'START INT'}))
trips = trips[['Start Station Id', 'End Station Id', 'START INT']]
trips
trips = (trips.merge(intersections_lookup, left_on='End Station Id', right_on='Station Id')
              .rename(columns={'INT_ID': 'END INT'}))
trips = trips[['START INT', 'END INT']]
To avoid running networkx through 2 million trips, we'll only run networkx through unique origin destination pairs, then multiply the resulting path by the number of trips made for that origin destination pair. We can do that by grouping by both the START INT and END INT.
This brings us down to 112,705 replications that we need to run networkx for.
# Collapse individual trips into unique origin-destination pairs with a count,
# so networkx only has to route each pair once.
trip_od = (trips.groupby(['START INT', 'END INT'])
                .size()
                .reset_index(name='trips'))
trip_od
trip_od.sort_values(by='trips', ascending=False)
networkx to find the path for each OD pair¶Some assumptions this process is making:
# Total trips across all OD pairs (sanity check against the raw trip count)
trip_od['trips'].sum()
Additionally, a small number of trips (capturing the "touring" behaviour) start and end at the same station. This, and similar trips will not be properly captured in networkx since it will just return an empty path.
# Trips that start and end at the same intersection ("touring" trips)
trip_od[trip_od['START INT'] == trip_od['END INT']]['trips'].sum()
As an example, lets route the most popular origin-destination pair, from 30102397 to 13468181. This pair has 2043 trips, and is from Queens' Quay and Bathurst, to Queens Quay and York.
The first step is to use the shortest path function. This uses the dijkstra shortest path algorithm to calculate the shortest path, based on our cost column length_adj.
Then, we can create another graph from that path that only contains the nodes and links that were used to construct that path. This is how we can extract the links from the graph.
# Route the busiest OD pair with dijkstra (weighted by length_adj via the
# 'length' attribute), then collect the GEO_ID of every segment on the path
# by walking consecutive node pairs.
most_popular_nodes = nx.shortest_path(g, 30102397, 13468181, weight='length')  # dijkstra
route = [g.edges[u, v]['key'] for u, v in zip(most_popular_nodes, most_popular_nodes[1:])]
# Unrotated centreline in its original CRS, used for plotting on a basemap
centreline_unrotate = gpd.read_file('centreline.geojson')
Now we can grab the associated geometries from the centreline file to map it. The basemap function contextily requires us to use web mercator, so we'll need to use epsg = 3857.
# Geometries of the routed segments in web mercator (EPSG:3857), as contextily requires
route_poly = centreline_unrotate[centreline_unrotate['GEO_ID'].isin(route)].to_crs(epsg = '3857')
boundary = gpd.read_file('to_boundary.geojson')
# Map of the busiest OD pair's route on an OpenStreetMap basemap
plt.rcParams['figure.dpi'] = 450
import contextily as cx
to_basemap = cx.Place("Toronto, Canada")
fig, ax = plt.subplots()
ax.set_title('Queens Quay from Bathurst to York Street is the Busiest Origin-Destination Pair', fontsize = 10, fontweight = 'bold')
fig.set_size_inches(7,4)
route_poly.plot(ax = ax, linewidth = 4)
cx.add_basemap(ax, source=cx.providers.OpenStreetMap.Mapnik)
ax.set_axis_off()
#north arrow (drawn in axes coordinates)
x, y, arrow_length = 0.85, 0.4, 0.15
ax.annotate('N', xy=(x, y), xytext=(x, y- 0.20),
arrowprops = dict(facecolor='black', width=1, headwidth=6),
ha='center', va='center', fontsize=10,
xycoords=ax.transAxes)
ax.add_artist(ScaleBar(1, location = 'lower right', length_fraction = 0.25))# scale bar
# Drop same-intersection pairs; networkx would just return an empty path for them
trip_od = trip_od[trip_od['START INT'] != trip_od['END INT']]
trip_od
Now that we know the process for a single origin-destination pair, we can operationalize this in a loop. For each loop, once we extract the links, we'll assign it to a list that will save the number of trips, and the links used in that trip.
We're using a try/except statement because certain pairs could not be routed and broke the loop. This ensures that even if a trip could not be routed, we can still continue the loop.
# Route every unique OD pair through the network. For each pair, record the
# GEO_ID of every segment on its shortest path together with the pair's trip
# count; pairs with no path are collected separately. Progress is printed
# every 5000 index values.
start = time.time()
route_table = []   # [GEO_ID, trips] for every segment on every routed path
unroutable = []    # [START INT, END INT, trips] for pairs with no path
next_start = time.time()
for index, row in trip_od.iterrows():
    try:
        sp = nx.shortest_path(g, row['START INT'], row['END INT'], weight='length')  # dijkstra algorithm
        pathGraph = nx.path_graph(sp)  # subgraph along the shortest path
        for ea in pathGraph.edges():  # appending edges in shortest path to list
            route_table.append([g.edges[ea[0], ea[1]]['key'], row['trips']])
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Only swallow genuine routing failures — the original bare `except:`
        # would also hide KeyboardInterrupt and real bugs in the loop body.
        unroutable.append([row['START INT'], row['END INT'], row['trips']])
    if index%5000 == 0:
        print(index, time.time() - next_start)
        next_start = time.time()
print(time.time() - start)
After running for 30 minutes, we successfully routed almost every pair. However, 12.7% of all trips could not find a path.
For this project, we didn't spend any time investigating the unroutable trips, but from the results, it seemed like only Old Toronto trips were successfully routed.
# Share of demand we failed to route, then persist and reload the routed edges.
unroutable_df = pd.DataFrame(unroutable, columns=['START INT', 'END INT', 'trips'])
round(unroutable_df['trips'].sum() / trip_od['trips'].sum() * 100, 3)
pd.DataFrame(route_table).to_csv('routes.csv')  # saving the file
centreline_counts = pd.DataFrame(route_table, columns=['GEO_ID', 'volume'])
To condense the table, we'll group by each GEO_ID, and then sum up all the trips occuring on that segment to get a column we'll name volume. Then we'll merge it back onto the centreline layer to grab other data we need.
# Total predicted volume per segment, busiest first, then join back onto the
# centreline layer to recover geometry and attributes.
centreline_counts = (centreline_counts.groupby('GEO_ID', as_index=False)['volume']
                     .sum()
                     .sort_values(by='volume', ascending=False)
                     .reset_index(drop=True))
centreline_counts = centreline.merge(centreline_counts, how='inner')
For a later chart, lets find the top 500 road segments for bikeshare users.
# The 500 highest-volume segments (sorted ascending, so tail() takes the largest)
top_500 = centreline_counts.sort_values(by = 'volume', ascending = True).tail(500)
# City boundary, projected and rotated -17 degrees about the origin to align
# with the rotated network; vectorized apply replaces the slow iterrows/.at loop.
to_boundary = gpd.read_file('toronto.geojson')
to_boundary = to_boundary.to_crs(epsg = '26917')
to_boundary['geometry'] = to_boundary['geometry'].apply(
    lambda geom: shapely.affinity.rotate(geom, angle=-17, origin=Point(0, 0)))
# Choropleth of predicted volume on every routed segment
fig, ax = plt.subplots()
fig.set_size_inches(7,4)
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.1) #to clean up the chloropleth legend
centreline_counts.plot(ax = ax, column = 'volume', cmap = 'inferno_r', legend = True, linewidth = 0.75, zorder = 2,
legend_kwds={'label': "Volume of Cyclists"}, cax = cax, vmin = -0, vmax = 120000)
ax.set_title('Predicted volumes of BikeShare trips in 2019, using dijkstra shortest path algorithm', fontsize = 8)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_axis_off()
ax.add_artist(ScaleBar(1, location = 'lower right', length_fraction = 0.25))# scale bar
This initial choropleth plot of the busyness of each segment is a little overplotted. Almost every road in Old Toronto has a trip, and it's very likely that these roads receive only a few trips for all of 2019.
ylim = ax.get_ylim() #saving the extents for a later map
xlim = ax.get_xlim()
Lets create a simple histogram to understand the distribution of our data.
# Side-by-side histograms of segment volume: full range (right) and 0-1000 zoom (left)
fig, (ax1, ax2) = plt.subplots(1,2)
centreline_counts.hist('volume', bins = 250, ax = ax1)
centreline_counts.hist('volume', bins = 250, ax = ax2)
ax1.set_xlim([0,1000])
This histogram confirms that picture. The right plot shows the entire histogram, while the left shows only the 0 to 1000 section. As a solution, let's only plot the top quartile of road segments.
# Quartile cut points of volume; the 75th percentile is used as the map's threshold below
centreline_counts['volume'].quantile([0.25,0.5,0.75])
# Choropleth of the top-quartile segments (volume > 5107, the 75th percentile),
# with faint grey arterials/expressways for context on a light boundary fill
fig, ax = plt.subplots()
fig.set_size_inches(7,4)
divider = make_axes_locatable(ax)
ax.set_facecolor('xkcd:sky blue')
cax = divider.append_axes("right", size="5%", pad=0.1) #to clean up the chloropleth legend
centreline[centreline['FCODE_DESC'].isin(['Expressway', 'Expressway Ramp', 'Major Arterial Ramp', 'Minor Arterial Ramp'])].plot(
zorder = 1, color = 'xkcd:grey', ax = ax, linewidth = 0.5, alpha = 0.5)
centreline[centreline['FCODE_DESC'] == 'Major Arterial'].plot(zorder = 1, color = 'xkcd:grey', ax = ax, linewidth = 0.3, alpha = 0.5)
centreline[centreline['FCODE_DESC'] == 'Minor Arterial'].plot(zorder = 1, color = 'xkcd:grey', ax = ax, linewidth = 0.3, alpha = 0.3)
to_boundary.buffer(40).plot(zorder = 0, color = '#fffefc', ax = ax)
#to_boundary.plot(zorder = 0, color = 'w', ax = ax)
centreline_counts[centreline_counts['volume'] > 5107].plot(ax = ax, column = 'volume', cmap = 'inferno_r',
legend = True, linewidth = 0.75, zorder = 2,
legend_kwds={'label': "Volume of Cyclists"}, cax = cax, vmin = 0, vmax = 120000)
plt.suptitle('Downtown and the Waterfront have the busiest roads for cyclists', fontsize = 12, fontweight = 'bold', y = 0.94)
ax.set_title('Predicted volumes of BikeShare trips in 2019, using dijkstra shortest path algorithm, top 25% of roads only', fontsize = 8)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_xticks([])
ax.set_yticks([])
#ax.set_axis_off()
ax.set_ylim(ylim)
ax.set_xlim(xlim)
ax.add_artist(ScaleBar(1, location = 'lower right', length_fraction = 0.25))# scale bar
That's a lot better. From the map, we can see that key bike lanes, such as Sherbourne, the Martin Goodman Trail, Simcoe, Bloor, Beverley/St George, Brunswick, and Hoskin/Harbord/Wellesley dominate the counts. Overall, like with the trips by neighbourhood map, bikeshare trips are concentrated downtown, with some activity spreading out along the waterfront.
There are some interesting notes. Some roads with no bike lanes, such as The Esplanade and Victoria Street have high ridership. Our process favours residential roads, so those roads acted as a by pass for users on Yonge and King, even if those residential roads have no levels of bike infrastructure.
Another note is the section of the Martin Goodman Trail by Roncy. Because we didn't consider elevation changes, it routed cyclists on the bridge over the Gardiner, however this doesn't match reality. This is likely also the case for why the Riverdale Bridge over the DVP has more activity than Dundas, Gerrard, or the Prince Edward Viaduct.
# Map of the 500 busiest segments only, drawn thicker, zoomed to downtown
fig, ax = plt.subplots()
fig.set_size_inches(7,4)
divider = make_axes_locatable(ax)
ax.set_facecolor('xkcd:sky blue')
cax = divider.append_axes("right", size="5%", pad=0.1)
centreline[centreline['FCODE_DESC'].isin(['Expressway', 'Expressway Ramp', 'Major Arterial Ramp', 'Minor Arterial Ramp'])].plot(
zorder = 1, color = 'xkcd:grey', ax = ax, linewidth = 0.5, alpha = 0.5)
centreline[centreline['FCODE_DESC'] == 'Major Arterial'].plot(zorder = 1, color = 'xkcd:grey', ax = ax, linewidth = 0.3, alpha = 0.5)
centreline[centreline['FCODE_DESC'] == 'Minor Arterial'].plot(zorder = 1, color = 'xkcd:grey', ax = ax, linewidth = 0.3, alpha = 0.3)
to_boundary.buffer(40).plot(zorder = 0, color = '#fffefc', ax = ax)
#to_boundary.plot(zorder = 0, color = 'w', ax = ax)
top_500.plot(ax = ax, column = 'volume', cmap = 'inferno_r',
legend = True, linewidth = 3, zorder = 2,
legend_kwds={'label': "Volume of Cyclists"}, cax = cax, vmin = 20000, vmax = 120000)
plt.suptitle('Simcoe, Martin Goodman Trail, Bloor, Sherbourne,\nWellesley, Beverley, and Esplanade are Cycling Hotspots', fontsize = 12, fontweight = 'bold', y = 1.02)
ax.set_title('Top 500 road segments for bikeshare trips, using dijkstra shortest path algorithm', fontsize = 8)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_xticks([])
ax.set_yticks([])
#ax.set_axis_off()
ax.set_ylim([4436000, 4442000])
ax.set_xlim([2011000,2021000])
ax.add_artist(ScaleBar(1, location = 'lower right', length_fraction = 0.25))# scale bar
The top 500 road segments confirms the map above, except we can clearly see the key corridors that have a lot of bike traffic. An obvious place for the city to create more bike infrastructure is to create another crossing under the Gardiner/Rail corridor, to relieve traffic on Simcoe (such as extending the Bay Street bike lanes). Another east downtown North South Bike lane may also be attractive since Sherbourne and Victoria streets are pretty busy.
Anecdotally, the Queen's Quay section of the Martin Goodman Trail has always been the slowest and busiest section, and this map proves quantitatively that it may be worth looking into another way to relieve bike traffic around the ferry terminal.
We'll use another concept from transportation research called vehicles kilometers traveled, or VKT for short. We can't easily sum up the volumes that lie on the same street, but we can use VKT to account for distance when grouping road segments into road corridors.
VKT is simply the volume of cyclists a segment has multiplied by the length of that segment.
# VKT = volume x segment length in km (length is in metres, hence /1000)
centreline_counts['vkt'] = centreline_counts['volume']*centreline_counts['length']/1000
This allows us to sum up the VKT grouped by road. Let's find out the top 10 corridors for bikeshare VKT.
# Top 10 road corridors by total VKT. Select ['vkt'] BEFORE aggregating so the
# groupby never tries to sum non-numeric columns (e.g. geometry), which is
# deprecated/raises in newer pandas.
road_top10 = centreline_counts.groupby('LF_NAME')[['vkt']].sum().sort_values(ascending = False, by = 'vkt').head(10)
road_top10 = road_top10.reset_index()
We'll also want to map the bike infrastructure to each road.
# One row per (street, infrastructure-label) pair, then attach the labels to
# the top-10 corridors by street name.
bikeways_unique = (bikeways.sort_values(by='STREET_NAME')[['STREET_NAME', 'INFRA_HIGHORDER']]
                   .drop_duplicates(keep='first'))
road_top10 = road_top10.merge(bikeways_unique, left_on='LF_NAME', right_on='STREET_NAME', how='left')
road_top10
Some roads have multiple levels of bike infrastructure because of different segments. We'll manually choose the most appropriate, but for a more comprehensive analysis, we'd ideally want an automated process.
Similarly, we'll need to manually clean up the INFRA_HIGHORDER column.
# Manual label cleanup for the chart. NOTE: order-dependent — each np.where
# overwrites the column, and the final sort/drop_duplicates relies on NaN
# sorting last so the non-null label is kept per (road, vkt) pair.
# collapse planning-document alias to the plain label
road_top10['INFRA_HIGHORDER'] = np.where(road_top10['INFRA_HIGHORDER'] == 'MUT (2016 Network Plan/2012 Trails Plan)',
'Multi-Use Trail', road_top10['INFRA_HIGHORDER'])
# treat contraflow cycle tracks as cycle tracks
road_top10['INFRA_HIGHORDER'] = np.where(road_top10['INFRA_HIGHORDER'] == 'Contraflow Cycle Track',
'Cycle Track', road_top10['INFRA_HIGHORDER'])
# arterial sharrows are not meaningful infrastructure for this chart
road_top10['INFRA_HIGHORDER'] = np.where(road_top10['INFRA_HIGHORDER'] == 'Sharrows - Arterial',
np.nan, road_top10['INFRA_HIGHORDER'])
# manual overrides for two roads with mismatched labels
road_top10['INFRA_HIGHORDER'] = np.where(road_top10['LF_NAME'] == 'Yonge St',
np.nan, road_top10['INFRA_HIGHORDER'])
road_top10['INFRA_HIGHORDER'] = np.where(road_top10['LF_NAME'] == 'Lake Shore Blvd East Trl',
'Multi-Use Trail', road_top10['INFRA_HIGHORDER'])
# keep one label per road (non-null preferred), then restore vkt ordering
road_top10 = road_top10.sort_values(by = 'INFRA_HIGHORDER').drop_duplicates(['LF_NAME', 'vkt'], keep = 'first'
).sort_values(by = 'vkt', ascending = False)
road_top10['INFRA_HIGHORDER'] = road_top10['INFRA_HIGHORDER'].fillna('None') # so that we get a legend colour
road_top10 = road_top10.rename(columns = {'INFRA_HIGHORDER':'Bike Infrastructure'})
road_top10
# Citywide total VKT, for the percentage quoted in the discussion below
centreline_counts['vkt'].sum()
# Horizontal bar chart of the top-10 corridors, coloured by infrastructure type
fig, ax = plt.subplots()
fig.set_size_inches(7,4)
sns.barplot(y = 'LF_NAME', x = 'vkt', data = road_top10, ax = ax, orient = 'h', hue = 'Bike Infrastructure', dodge = False)
ax.set_yticks(range(10))
ax.set_xticks(range(0,700000,100000))
ax.set_ylabel('Road/Trail')
ax.set_xlabel('Vehicle Kilometers Travelled')
ax.set_title('Bikeshare Traffic in 2019, by Road', fontsize = 10)
plt.suptitle('The Martin Goodman Trail is the busiest bike path', fontweight = 'bold', fontsize = 12)
# thousands separators on the x axis
ax.xaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))
ax.set_yticklabels(road_top10['LF_NAME'])
plt.show()
Its clear that the Martin Goodman Trail has high VKT and high traffic, and thats one of the main reasons why West End Toronto is number 2 on the busiest boroughs for Bikeshare riders. Having 11.2% of the total 4.7 million VKT in the city (or in other words, 11.2% of total bikeshare traffic), this shows that its one of the most important corridors in the city for east west travel.
Most of the other top corridors have some level of bike infrastructure with only Victoria Street and The Esplanade not having any (although The Esplanade was part of ActiveTO and is in the planning process to get permanent bikelanes).
It's unclear how much riders are actually using Victoria Street, as opposed to Yonge Street (due to the way we prioritized routes), but this does make the case for additional bike infrastructure in the yongeTOmorrow initiative by the city.